Example #1
    def synctable(self, sourceDb, targetDb, sourceTable, targetTable):
        sourceCursor = sourceDb.cursor()
        targetCursor = targetDb.cursor()
        affected_total = 0
        init_rowCount = targetTable.rowCount if targetTable.rowCount < sourceTable.rowCount else sourceTable.rowCount
        pbar = tqdm(total=sourceTable.rowCount, unit='records')
        pbar.update(init_rowCount)
        while sourceTable.lastUpdatetime > targetTable.lastUpdatetime:
            affected_rows = 0
            batchSize = 100000
            sql = "SELECT * FROM (SELECT * FROM {schema}.{tablename} WHERE {timestamp}>=to_timestamp('{last_updatetime}','yyyy-mm-dd hh24:mi:ss.ff6') ORDER BY {timestamp}) WHERE ROWNUM<={batch_size}".format(
                timestamp=sourceTable.timestampField,
                schema=sourceTable.schema,
                tablename=sourceTable.tablename,
                last_updatetime=targetTable.lastUpdatetime,
                batch_size=batchSize)
            sourceRecord = etl.fromdb(lambda: CursorProxy(sourceDb.cursor()),
                                      sql)
            targetRecord = etl.fromdb(
                lambda: CursorProxy(targetDb.cursor()),
                "SELECT * FROM {schema}.{tablename} WHERE 1=0".format(
                    schema=targetTable.schema,
                    tablename=targetTable.tablename))
            sourceTable.columns = etl.header(sourceRecord)
            targetTable.columns = etl.header(targetRecord)
            for column in list(
                    set(sourceTable.columns) - set(targetTable.columns)):
                sourceRecord = etl.cutout(sourceRecord, column)
            max_updatetime = sourceRecord.cut(
                sourceTable.timestampField).skip(1).max()[0]
            sourceRecord = sourceRecord.sort(sourceTable.timestampField)
            etl.appenddb(sourceRecord,
                         CursorProxy(targetCursor),
                         targetTable.tablename,
                         schema=targetTable.schema,
                         commit=True)
            affected_rows += targetCursor.rowcount
            targetTable.lastUpdatetime = max_updatetime.strftime(
                '%Y-%m-%d %H:%M:%S.%f')
            targetTable.rowCount += affected_rows
            pbar.update(affected_rows if init_rowCount + affected_total +
                        affected_rows < sourceTable.rowCount else
                        sourceTable.rowCount - init_rowCount - affected_total)
            affected_total += affected_rows
            pbar.set_description("%s |%d records updated." %
                                 (targetTable.tablename, affected_total))

        if targetTable.lastUpdatetime > sourceTable.lastUpdatetime:
            pbar.set_description("%s |timestamp >, skip." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount == sourceTable.rowCount:
            pbar.set_description("%s |no data change." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount > sourceTable.rowCount:
            pbar.set_description("%s |RowCount > but timestamp ==, skip." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount < sourceTable.rowCount:
            pbar.set_description("%s |RowCount < but timestamp ==, skip." %
                                 (targetTable.tablename))
        pbar.close()
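CursorProxy is not defined in this snippet. A plausible stand-in, assuming it exists to work around DB-API drivers (such as cx_Oracle) whose executemany expects a list rather than the iterator petl supplies, might look roughly like this:

class CursorProxy(object):
    def __init__(self, cursor):
        self._cursor = cursor

    def executemany(self, statement, parameters, **kwargs):
        # materialize the parameter iterator for drivers that require a list
        self._cursor.executemany(statement, list(parameters), **kwargs)

    def __getattr__(self, item):
        # delegate everything else to the wrapped DB-API cursor
        return getattr(self._cursor, item)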
Example #2
def get_delta(source_table, target_table, key='id'):
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)

    if source_table_headers != target_table_headers:
        raise Exception(
            'Source table columns do not match target table columns')

    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    added_ids_table, _ = etl.diff(source_ids, target_ids)

    merged_table = etl.merge(source_table, target_table, key=key)

    load_frame = etl.todataframe(
        etl.selectin(target_table, key, etl.values(added_ids_table, key)))
    print(load_frame)

    for row in etl.data(merged_table):
        for i, col in enumerate(row):
            if isinstance(col, etl.transform.reductions.Conflict):
                changes = tuple(col)
                print('For car {}, {} changed from {} to {}'.format(
                    row[0], source_table_headers[i], changes[1], changes[0]))
                row_dict = dict(zip(source_table_headers, list(row)))
                row_dict[source_table_headers[i]] = changes[0]
                row_dict = {key: [val] for (key, val) in row_dict.items()}
                print(row_dict)
                df = pd.DataFrame(row_dict)
                load_frame = load_frame.append(df, ignore_index=True)
                break

    return etl.fromdataframe(load_frame)
Example #3
def test_no_header_mutation():
    data = [
        ['dur', 'description', 'start', 'alpha'],
        [.4, 'test 1', datetime(2000, 1, 1, 15, 15), 0],
        [.7, 'test 1', datetime(2000, 1, 1, 20, 15), 0],
        [1.6, 'test 1', datetime(2000, 1, 20, 15, 15), 0],
        [8.4, 'test 1', datetime(2000, 1, 20, 20, 15), 0],
    ]

    result = group_entries_by_day(data)
    assert set(petl.header(data)) == set(petl.header(result))
Example #4
def load(tables_by_id, output_folder, devices):
    for device_id in tables_by_id:
        name = valid_name(devices[device_id]['name'])
        tbl_device_file = path.join(output_folder, f"{name}.csv")
        if path.isfile(tbl_device_file):
            tbl_old = petl.fromcsv(tbl_device_file, delimiter=';')
            old_header = petl.header(tbl_old)
            new_header = petl.header(tables_by_id[device_id])
            if old_header == new_header:
                petl.appendcsv(tables_by_id[device_id], source=tbl_device_file, delimiter=';')
            else:  # TODO: write to the new file
                raise ValueError(f"Incompatible headers:\n old={old_header}\n new={new_header}")
        else:
            petl.tocsv(tables_by_id[device_id], tbl_device_file, delimiter=';')
Example #5
def toxlswithheader(table, filename, sheet, **kwargs):
    """
    Use `petl.tocsv` to write CSV data in `table` to file `source`, including
    key-value metadata header if passed in as the keyword argument `metadata`.
    The first row in `table` is assumed to contain the header columns.
    """
    metadata = kwargs.pop("metadata", {})

    # prepare header
    header = petl.header(table)

    # prepare metadata rows using #-prefix, and :-suffix for keys
    metadata_rows = []
    for key, value in metadata.items():
        metadata_row = [''] * len(header)
        metadata_row[0] = '#' + str(key) + ':'
        metadata_row[1] = str(value)
        metadata_rows.append(metadata_row)

    # prepare data (stripped of header)
    data = petl.data(table)

    # combine metadata + header + data, then write out
    combined = metadata_rows + [header] + list(data)
    petl.toxls(combined, filename, sheet, **kwargs)
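A minimal usage sketch for the helper above; the table contents, file name and metadata keys are invented for illustration, and petl.toxls itself requires the xlwt package:

import petl

example = [['name', 'count'], ['a', 1], ['b', 2]]
toxlswithheader(example, 'counts.xls', 'Sheet1',
                metadata={'source': 'demo export', 'rows': 2})
# the sheet starts with one '#key:' row per metadata item, then the header row, then the data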
Example #6
def aggregate_collection(
    request: HttpRequest,
    collection_id: int,
) -> HttpResponse:
    """
    Value count computations could be also moved into a celery task that
    would prepare the answer for the user and bring it to him later
    (via email or on page with results).
    """
    collection = get_object_or_404(StarWarsCollection, id=collection_id)
    table = etl.fromcsv(collection.filepath)
    aggregate_keys, parameters_settings = parse_parameters(
        request.GET.get('current_parameters', '0000001001'),
    )
    if len(aggregate_keys) == 1:  # aggregate does not work correctly
        # if list with 1 element is passed
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:  # show no table if every option is disabled
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    return render(
        request,
        'main/collection_aggregate.html',
        {
            'collection': collection,
            'parameters_settings': parameters_settings,
            'headers': etl.header(table),
            'data': etl.data(table),
        },
    )
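The single-element-key unwrapping above can be illustrated with a plain in-memory table; the data is invented purely to show petl.aggregate grouping on one bare field name:

import petl as etl

tbl = [('color', 'size'), ('red', 'S'), ('red', 'M'), ('blue', 'S')]
counts = etl.aggregate(tbl, key='color', aggregation=len)
print(list(counts))  # header row followed by one row per colour with its count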
Example #7
    def get_context_data(self, **kwargs):
        def url(name, value):
            'return present url query with updated name-value pair'
            qd = self.request.GET.copy()
            if value:
                if isinstance(value, type([])):
                    qd.setlist(name, value)
                else:
                    qd[name] = value
            else:
                del qd[name]
            return qd.urlencode()

        def xor(list1, list2):
            return list(set(list1) ^ set(list2))

        context = super().get_context_data(**kwargs)
        csv = str(settings.MEDIA_ROOT / str(self.object.name)) + '.csv'
        table = etl.fromcsv(csv)
        group = self.request.GET.getlist('group', None)
        if group:
            context['buttons'] = {field: [url('group',xor(group, [field])), field in group] for field in etl.header(table)}
            context['header'] = {field: '' for field in group + ['Count']}
            context['load'] = url('group', None)
            context['rows'] = table.aggregate(key=group[0] if len(group)==1 else group, aggregation=len).records()
        else:
            load = int(self.request.GET.get('load', 10))
            context['header'] = {field: url('group',field) for field in etl.header(table)}
            table = table.head(load + 1)
            if table.len() > load + 1:  # Notice: table header is counted as a row too
                context['load_more'] = url('load', load + 10)
            context['rows'] = table.head(load).records()
        return context
Example #8
def test_header():
    """Test the header function."""
    
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    actual = header(table)
    expect = ('foo', 'bar')
    eq_(expect, actual)
Example #9
def load_table_data(csv_file, limit) -> TableData:
    table = etl.fromcsv(csv_file)
    return TableData(
        header=etl.header(table),
        data=etl.data(table, limit),
        next_limit=limit + 10 if limit < table.len() else None,
    )
Example #10
def tryExtractors(filename, encoding, delimiter_list=[',','\t', ';', '|'], quality_metric= lambda x: len(x)):
	result = []
	for d in delimiter_list:
		csvView = loadCSV(filename, encoding, delimiter=d)
		result.append((quality_metric(etl.header(csvView)), csvView))
	result.sort()
	return cleanFormatTable(result[-1][1])
Example #11
 def columns(self):
     """
     `Returns:`
         list
             List of the table's column names
     """
     return list(petl.header(self.table))
Example #12
def test_header():
    """Test the header function."""

    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    actual = header(table)
    expect = ('foo', 'bar')
    eq_(expect, actual)
Example #13
def load_more(filepath, page, total_items):
    length = 10
    offset = (page - 1) * length

    table = etl.fromcsv(filepath)
    rows = etl.rowslice(table, offset, offset + length)
    next_page = page + 1 if offset + length < total_items else None
    return etl.header(table), list(rows.dicts()), next_page
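A rough usage sketch for load_more; the CSV written here is a throwaway fixture and total_items is supplied by the caller, as the signature suggests:

import petl as etl

rows = [('id', 'name')] + [(i, 'row %d' % i) for i in range(25)]
etl.tocsv(rows, 'people.csv')

header, page_rows, next_page = load_more('people.csv', page=1, total_items=25)
print(header)          # ('id', 'name')
print(len(page_rows))  # 10 rows on the first page
print(next_page)       # 2, because more rows remain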
Example #14
def validate(cursor, table, constraints, task_name):
    header = etl.header(table)
    problems = etl.validate(table, constraints=constraints, header=header)
    problems = etl.addfield(problems, 'task_name', task_name)
    problems = etl.addfield(problems, 'create_date', datetime.now())

    # etl.todb(problems, cursor, 'etl_logs')
    etl.appenddb(problems, cursor, 'tetl_logs')
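Independently of the database logging above, petl.validate can be exercised on a small in-memory table; the constraint and values below are invented for illustration:

import petl as etl

table = [('id', 'age'), ('1', '23'), ('2', 'not a number')]
constraints = [dict(name='age_int', field='age', test=int)]
problems = etl.validate(table, constraints=constraints, header=('id', 'age'))
print(list(problems))  # one problem row is reported for the value that fails int()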
Example #15
def precip_table_etl_noaa(
    precip_table,
    rainfall_adjustment=1,
    frequency_min=1,
    frequency_max=1000,
    conversion_factor=2.54,
    desc_field="by duration for ARI (years):",
    duration_val="24-hr:"
    ):
    """
    Extract, Transform, and Load data from a NOAA PRECIPITATION FREQUENCY
    ESTIMATES matrix (in a csv) into an array used by the runoff calculator.
    
    Required Inputs:
        - precip_table: NOAA PRECIPITATION FREQUENCY ESTIMATES csv, in inches.
    Optional Inputs:
        - rainfall_adjustment: multipler to adjust for future rainfall
            conditions. defaults to 1.
        - frequency_min: the min. annual frequency to be returned. Default: 1
        - frequency_max: the max. annual frequency to be returned. Default: 1000
        - conversion_factor: apply to rainfall values. Default: 2.54
            (convert inches to centimeters).
        - desc_field: exact field name from NOAA table in first column.
            Defaults to "by duration for ARI (years):". Used for selecting
            data.
        - duration_val: exact row value in the desc_field from NOAA table that
            contains the duration of interest. Defaults to "24-hr:". Used for
            selecting data.
    Outputs:
        - precip_array: 1D array containing 24-hour duration estimate for
        frequencies 1,2,5,10,25,50,100,200,500,and 1000 year storm events
    """
    # load the csv table, skip the file header information, extract rows we need
    t1 = etl\
        .fromcsv(precip_table)\
        .skip(13)\
        .rowslice(0,19)
    # grab raw data from the row containing the x-hour duration event info
    t2 = etl\
        .select(t1, desc_field, lambda v: v == duration_val)\
        .cutout(desc_field)
    # generate a new header with only columns within frequency min/max
    h = tuple(
        i
        for i in list(etl.header(t2))
        if (int(i) >= frequency_min and int(i) <= frequency_max)
    )

    # for events within freq range, convert to cm, adjust for future rainfall
    t3 = etl\
        .cut(t2, h)\
        .convertall(lambda v: round(float(v) * conversion_factor * rainfall_adjustment, 2))
    # convert to a 1D array (values cast to floats)
    precips = list(etl.data(t3)[0])
    # also convert to a dictionary, for lookup by event
    precips_lookup = list(etl.dicts(t3))[0]
    # return 1D array and dictionary
    return precips, precips_lookup
Example #16
def load_grouped_data(csv_file, fields) -> TableData:
    table = etl.fromcsv(csv_file)
    if len(fields) == 1:
        fields = fields[0]
    return TableData(
        header=etl.header(table),
        data=etl.aggregate(table, key=fields, aggregation=len),
        next_limit=None,
    )
Example #17
    def get_context_data(self, **kwargs) -> dict:
        c = super().get_context_data(**kwargs)
        table = self.object.get_table()
        buttons = etl.header(table)
        offset = int(self.request.GET.get('offset', 1))
        offset_to = offset * self.page_size

        if aggregation_keys := tuple(
                set(buttons).intersection(set(self.request.GET.keys()))):
            table = self.object.get_aggregate_data(aggregation_keys)
Example #18
def create_config(csvfile, config_name):
    '''
        Creates a configuration file from a CSV file
    '''
    print(csvfile)
    var = ''
    try:
        open(config_name + ".ini")
        var = input("This file already exists. Do you wish to continue? (Yes/No) ")
    except IOError:
        pass

    if var == 'Yes':
        cfgfile = open(config_name + ".ini", "w")
        examplefile = open(config_name + ".example", "w")
    else:
        print("goodbye")
        sys.exit()

    c = fromcsv(csvfile)
    columns = header(c)
    it = iter(data(c))  # petl 1.x: data() replaces the removed iterdata()
    print(next(it))
    examplefile.write(str(see(rowslice(c, 2, 3))))
    examplefile.close()

    # add the settings to the structure of the file, and let's write it out...
    Config = configparser.ConfigParser()  # Python 3 stdlib module name
    # don't change names to lower case
    Config.optionxform = str
    Config.add_section('FieldTypes')
    Config.add_section('FieldMap')
    for name in columns:
        # drop anything in parentheses and connect words with underscores
        new = name.split("(", 1)[0].strip()
        new = new.replace("/", "_")
        new = new.replace(" ", "_")
        new = new.replace("-", "_")
        new = new.lower()
        # try to guess the field type from the column name
        if "amount" in name:
            print(name)
            Config.set('FieldMap', name, new + " FLOAT")
        else:
            print(name)
            Config.set('FieldMap', name, new + " VARCHAR(10)")

    Config.write(cfgfile)
    cfgfile.close()
Example #19
 def load_to_warehouse(self, db_info):
     connection = pymysql.connect(
         host=db_info['host'],
         user=db_info['user'],
         password=db_info['passwd'],
         db=db_info['db'],
     )
     connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
     for table in self.UPDATE_ORDER:
         data = self.etl_table.TABLES[table]()
         print(f'Loading {table}...\n{data}')
         columns = ','.join(etl.header(data))
         values = ','.join(['%s'] * len(etl.header(data)))
         duplicate_updates = ','.join([
             f'{column} = VALUES({column})' for column in etl.header(data)
         ])
         query = f"INSERT {table} ({columns}) VALUES ({values}) ON DUPLICATE KEY UPDATE {duplicate_updates};"
         print(query)
         connection.cursor().executemany(query, etl.records(data))
     connection.close()
Example #20
    def store_to_db(self, conn, tablename, data):
        try:
            if etl.nrows(data) == 0:
                return None
        except TypeError:
            return None

        cursor = conn.cursor()
        sql = "INSERT INTO %s (%s) " % (tablename, ','.join(
            etl.header(data))) + "VALUES %s"
        execute_values(cursor, sql, etl.data(data))
        conn.commit()
        conn.close()
Example #21
def produce_weather_data(count):
    result = []
    output_data = petl.fromcsv(config.weather_data_output_path, delimiter='|')
    header = petl.header(output_data)
    result.insert(0, header)
    for key, group in petl.rowgroupby(output_data, key='Position'):
        random_index = random.randint(0, config.available_records - 1)
        group_list = list(group)
        result.append(tuple(group_list[random_index]))
    result = result[0:count]
    merged_output = ['|'.join(data) for data in result]
    merged_output = '\n'.join(merged_output)
    return merged_output
Example #22
def collection(request: HttpRequest, pk: int) -> HttpResponse:
    limit = int(request.GET.get('limit', DEFAULT_LIMIT))
    columns: List[str] = request.GET.getlist('columns')

    collection: CollectionModel = get_object_or_404(CollectionModel, pk=pk)
    table: CSVView = etl.fromcsv(f'{settings.DATA_PATH}/{collection.filename}')

    if columns:
        table: CutOutView = table.valuecounts(*columns).cutout('frequency')
        headers: tuple = etl.header(table)
        rows: DataView = table.data()
        limit = None
    else:
        rows: DataView = table.data(0, limit)
        headers: tuple = etl.header(table)
        limit = limit + DEFAULT_LIMIT

    return render(
        request, 'collection.html', {
            'collection': collection,
            'headers': headers,
            'rows': rows,
            'limit': limit,
        })
Example #23
def group_entries_by_day(inp):
    hdr = petl.header(inp)

    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next

    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min

    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
Example #24
def test_basics():

    t1 = (('foo', 'bar'), ('A', 1), ('B', 2))
    w1 = etl.wrap(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(etl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(etl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'), (1, 'A'), (2, 'B'))
    ieq(expect2, w2)
    ieq(etl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
Example #25
def test_basics():

    t1 = (('foo', 'bar'), ('A', 1), ('B', 2))
    w1 = FluentWrapper(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'), (1, 'A'), (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)
Example #26
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
        context["columns_query_kwarg"] = self.columns_query_kwarg
        table = petl.fromcsv(self.object.downloaded_file)

        full_table_header = list(petl.header(table))
        context["column_options"] = full_table_header

        selected_columns = [c for c in self.request.GET.getlist(self.columns_query_kwarg) if c in full_table_header]
        context["selected_columns"] = selected_columns

        if selected_columns:
            context["header"] = selected_columns + ["Count"]
            context["rows"] = petl.records(
                petl.aggregate(table, selected_columns[0] if len(selected_columns) == 1 else selected_columns, len)
            )

        return context
Example #27
def load(request):
	filename = request.GET.get('name','')
	fullpath = settings.DATA_DIR+filename
	"""
	detector = UniversalDetector()
	file_open = open(fullpath)
	for line in file_open.readlines():
		detector.feed(line)
		if detector.done: break
		detector.close()
	file_open.close()
	"""
	encoding = 'ascii'#detector.result['encoding']

	response_data = {}
	a = tryExtractors(fullpath, encoding)
	response_data['result'] = [row for row in etl.head(a)]
	response_data['headers'] = etl.header(a)
	typeInference(a)
	return HttpResponse(json.dumps(response_data), content_type="application/json")
Example #28
    def validate_input(self, input):
        input_header = petl.header(input)

        assert self.input_fields.org_name in input_header, (
            f'Column "{self.input_fields.org_name}" not in input {input_header}'
        )
        assert self.input_fields.settlement in set(input_header) | {None}, (
            f'Column "{self.input_fields.settlement}" not in input {input_header}'
        )
        assert self.input_fields.date in set(input_header) | {None}, (
            f'Column "{self.input_fields.date}" not in input {input_header}')

        # output fields must not exist
        new_fields = {
            field_name(f, i)
            for f in self.output_fields.as_set
            for i in range(self.extramatches + 1)
        }
        assert set(input_header).isdisjoint(new_fields), (
            'Column[s] {} are already in input'.format(
                set(input_header).intersection(new_fields)))
Example #29
    def etl(self, *args, **kw):
        table = petl.fromxlsx(self._src_path)

        model = DEPTH_TO_WATER
        self._update_model(model, self._vocab)

        # group table by sys_loc_code
        header = petl.header(table)
        for name, records in petl.rowgroupby(petl.sort(table, 'sys_loc_code'),
                                             'sys_loc_code'):
            records = [dict(zip(header, record)) for record in records]
            record = records[0]
            location_id = self._post_location(record, model)
            thing_id = self._post_thing(record, model, location_id)

            print('---------------')
            print(f'len records {len(records)}')
            # self.add_package(record)
            self.observation.set_records(records)
            self.observation.etl(tids=self._make_tids(thing_id, record),
                                 models=(model, ))
Example #30
def test_basics():
    
    t1 = (('foo', 'bar'),
         ('A', 1),
         ('B', 2))
    w1 = FluentWrapper(t1)
    
    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())
    
    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)
    
    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)
Example #31
def test_basics():
    
    t1 = (('foo', 'bar'),
         ('A', 1),
         ('B', 2))
    w1 = etl(t1)
    
    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(petl.data(w1), w1.data())
    
    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(petl.cut(w1, 'bar', 'foo'), w2)
    
    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
Example #32
class CollectionDetailsView(DetailView):
    template_name = 'collection_details.html'
    queryset = SWPeopleCollection.objects.all()

    page_size = 10

    def get_context_data(self, **kwargs) -> dict:
        c = super().get_context_data(**kwargs)
        table = self.object.get_table()
        buttons = etl.header(table)
        offset = int(self.request.GET.get('offset', 1))
        offset_to = offset * self.page_size

        if aggregation_keys := tuple(
                set(buttons).intersection(set(self.request.GET.keys()))):
            table = self.object.get_aggregate_data(aggregation_keys)

        # Building the rows into an HTML string here is essentially a rendering
        # speed-up: doing it in the Django template would be slow. A custom
        # template tag would be more elegant, and extending petl to render HTML
        # directly into a template would also be nice.
        data = ''
        for row in etl.data(table, 0, offset_to):
            data += '<tr><td>' + '</td><td>'.join(row) + '</td></tr>'
        c.update({
            'headers':
            etl.header(table),
            'buttons':
            buttons,
            'data':
            data,
            'offset':
            offset + 1,
            'offset_extra_params':
            '&'.join(['{}=on'.format(i) for i in aggregation_keys]),
            'offset_reached':
            table.len() < offset_to,
            'aggregation_keys':
            aggregation_keys
        })
        return c
Example #33
def records_for_update(source, existing, update_keys, key='id', source_key=None, existing_key=None):
    '''
    Return a petl compatible list of data which represents any source rows whose keys appear in update_keys
    and whose data is different from the corresponding row in existing.

    :param source: A petl table of source data
    :param existing: A petl table of existing data
    :param update_keys: A list of keys prefiltered to include only those keys that _could_ be source_update_candidates
    :param key: The name of the primary key field
    '''
    if source_key is None and existing_key is None:
        source_key = existing_key = key

    source_update_candidates = petl.transform.select(source, lambda rec: rec[source_key] in update_keys).lookup(source_key)
    existing_update_candidates = petl.transform.select(existing, lambda rec: rec[existing_key] in update_keys).lookup(existing_key)

    to_update = [petl.header(source)]
    for k, source_rec in source_update_candidates.items():
        existing_rec = existing_update_candidates[k]
        if will_change(source_rec, existing_rec):
            to_update.append(source_rec)
    return to_update
Example #34
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
        context["count_query_kwarg"] = self.count_query_kwarg

        table = petl.fromcsv(self.object.downloaded_file)
        context["header"] = petl.header(table)

        try:
            record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
        except (TypeError, ValueError):
            record_count_to_show = self.count_increment

        # Potentially expensive, cache / save in database for dataset
        if petl.nrows(table) > record_count_to_show:
            context[
                "load_more_url"
            ] = f"{self.request.path}?{self.count_query_kwarg}={record_count_to_show+self.count_increment}"

        context["rows"] = petl.records(petl.head(table, record_count_to_show))

        return context
Example #35
def make_sqlalchemy_table(table, tablename, schema=None, constraints=True, metadata=None):
    """
    Create an SQLAlchemy table based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    if not metadata:
        metadata = sqlalchemy.MetaData()

    sql_table = sqlalchemy.Table(tablename, metadata, schema=schema)

    fields = header(table)
    cols = columns(table)

    for f in fields:
        sql_column = make_sqlalchemy_column(cols[f], f, constraints=constraints)
        sql_table.append_column(sql_column)

    return sql_table
Example #36
def make_sqlalchemy_table(table, tablename, schema=None, constraints=True, metadata=None):
    """
    Create an SQLAlchemy table based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    if not metadata:
        metadata = sqlalchemy.MetaData()

    sql_table = sqlalchemy.Table(tablename, metadata, schema=schema)

    fields = header(table)
    cols = columns(table)

    for f in fields:
        sql_column = make_sqlalchemy_column(cols[f], f, constraints=constraints)
        sql_table.append_column(sql_column)

    return sql_table
Example #37
def attendance_file2table(filename, output_csv_filebase, add_extra_fields):
    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit human-readable event name if add_extra_fields
    # flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    total_row_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0, "week4": 0, "week5": 0, "week6": 0, "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year that this attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search("For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing week1, week2, ... week6 Sunday dates
                # If a week has no Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service time that this attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search("Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended) and total (summary at bottom) rows
        else:

            # Once we found row with totals...we're done, that's last line in attendance file we need to parse
            matched_total_line = re.search("^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"), [-3, -9, -15, -20, -24, -29, -35], 3
                )
                break

            matched_attendance_line = re.search(
                "^ {6}"
                + "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), "
                + "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?"
                + "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?"
                + "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line,
            )
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group("full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group("attendance").strip()
                        row_dict = attendance_str2dict(attendance, [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (
                            row_dict["week1"]
                            + row_dict["week2"]
                            + row_dict["week3"]
                            + row_dict["week4"]
                            + row_dict["week5"]
                            + row_dict["week6"]
                        ):
                            print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break

                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

            # Buffer the current line for line folding if needed (see 'line folding' above)
            prior_line = line
            line_number += 1

    print "*** Number of attendance lines processed: " + str(num_processed_lines)
    print "*** Number of attendees: " + str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = (
            output_csv_filebase + "/" + str(year) + format(month, "02d") + "_" + str(event_names[event_id]) + ".csv"
        )
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual date/event attendance.  I.e. a row per
    # worship service date vs original attendance dicts format of a row per attendee across all weeks in month.
    # This is the actual one returned and eventually emitted into output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print('*** WARNING! Cannot find "' + full_name + '" in map', file=sys.stderr)
                        print(file=sys.stderr)
                else:
                    print('*** WARNING! Cannot find Sunday date for week index "' + str(week_index) + '"', file=sys.stderr)
                    print(file=sys.stderr)

    # Check if numbers on Servant Keeper's reported Total: line match the totals we've been accumulating
    # per attendance row entry.  If they don't match, show WARNING (not ERROR, since via manual checks, it appears
    # that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(
            return_table, "Full Name", "Event Name", "Time", "Week Num", "Date", "Event ID", "Individual ID"
        )
    else:
        return_table = petl.cut(return_table, "Date", "Event ID", "Individual ID")

    return return_table
Example #38
# csv = comma delimited, tsv = tab delimited
pre_etl_time = time.time()
a = etl.fromtsv('snpdata.csv')
post_etl_time = time.time()
b = etl.fromtsv('popdata.csv')

pre_df_time = time.time()
df_a = pd.read_csv('snpdata.csv', sep='\t', header=0)
post_df_time = time.time()

print("ETL time to load A file: {} Pandas time to load A file: {}".format(
    post_etl_time - pre_etl_time, post_df_time - pre_df_time))

df_b = pd.read_csv('popdata.csv', sep='\t', header=0)

header_a = etl.header(a)
header_b = etl.header(b)
if _DEBUG:
    print(header_a)
    print(header_b)

b_renamed = b.rename({
    'Chromosome': 'Chr',
    'Coordinates': 'Pos',
    'Ref. Allele': 'Ref',
    'Non-Ref. Allele': 'Nref',
    'Derived Allele': 'Der',
    'Mutation type': 'Mut',
    'Gene': 'GeneId',
    'Gene Aliases': 'GeneAlias',
    'Gene Description': 'GeneDescr'
Example #39
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs = '*', default=argparse.SUPPRESS,
        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number "
        "of header rows")
    parser.add_argument("--first-ccb-column", required=False, help="String name of first CCB column.  If "
        "specified, all preceeding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print(sep + output_str)
            print(output_str, file=sys.stderr)
            if args.sep_columns is not None and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print(output_str)
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print(output_str)
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
Example #40
def test_header():
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    actual = header(table)
    expect = ('foo', 'bar')
    eq_(expect, actual)
Example #41
list(bar)
# values from multiple fields
table2 = [["foo", "bar", "baz"], [1, "a", True], [2, "bb", True], [3, "d", False]]
foobaz = etl.values(table2, "foo", "baz")
foobaz
list(foobaz)


# header()
##########


import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
etl.header(table)


# fieldnames()
##############

import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
etl.fieldnames(table)
etl.header(table)


# data()
########
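# (the body of this last snippet is missing above; a minimal continuation in the
#  same style, assuming the usual pattern of these petl examples, would be)

import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
d = etl.data(table)
list(d)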
Example #42
def cleanFormatTable(table):
	newtable = table
	for h in etl.header(table):
		# accumulate the conversions on newtable; converting from the original
		# table each time would keep only the last column's conversion
		newtable = etl.convert(newtable, h, sanitize)
	return newtable
Example #43
def typeInference(table):
	for h in etl.header(table):
		col = etl.cut(table, h)
		print(etl.nrows(col))
Example #44
def test_header():
    table = (("foo", "bar"), ("a", 1), ("b", 2))
    actual = header(table)
    expect = ("foo", "bar")
    eq_(expect, actual)