コード例 #1
0
}, {
    'attr1': 25
}, {
    'attr1': None
}, {
    'attr1': 5
}, {
    'attr1': None
}, {
    'attr1': None
}]

wrapper = pygrametl.ConnectionWrapper(connection=null_conn)

dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'],
)

for row in data:
    dim1.insert(row)

dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)

print(notnull_tester.run(null_rep))

null_conn.close()
コード例 #2
0
ファイル: etl.py プロジェクト: onemoreuselessuser/pygram-test
    # Finally each part is reassigned to the row dictionary. It can then be
    # accessed by the caller as the row is a reference to the dict object
    row['year'] = timestamp_split[0]
    row['month'] = timestamp_split[1]
    row['day'] = timestamp_split[2]


# The Location dimension is filled with data from the CSV file, as the file
# contains information for both columns in the table. If the dimension was
# filled using the sales database, it would be necessary to update the
# region attribute with data from the CSV file later irregardless.
# To perform the insertion, the method Dimension.insert() is used which
# inserts a row into the table, and the connection wrapper is asked to
# commit to ensure that the data is present in the database to allow for
# lookups of keys for the fact table
[location_dimension.insert(row) for row in region_source]

# The file handle for the CSV file can then be closed
region_file_handle.close()

# As all the information needed for the other dimensions are stored in the
# sales database, we can loop through all the rows in it, split the timestamp
# into its three parts, and lookup the three dimension keys needed for
# the fact table while letting pygrametl update each dimension with the
# necessary data using Dimension.ensure(). Using this method instead of
# insert combines a lookup with a insertion so a new row only is inserted
# into the dimension or fact table, if it does not yet exist.
for row in sales_source:

    # The timestamp is split into its three parts
    split_timestamp(row)
コード例 #3
0
ファイル: dw.py プロジェクト: Betaboxguugi/P6
def setup(dbname, number):

    open(os.path.expanduser(dbname), 'w')

    output_conn = sqlite3.connect(dbname)
    output_cur = output_conn.cursor()

    output_cur.execute("CREATE TABLE dim1 "
                       "(key1 INTEGER PRIMARY KEY, attr1 INTEGER, "
                       "attr2 INTEGER)")

    output_cur.execute("CREATE TABLE dim2 "
                       "(key2 INTEGER PRIMARY KEY, attr3 INTEGER, "
                       "attr4 INTEGER)")

    output_cur.execute("CREATE TABLE ft1 "
                       "(measure INTEGER, key1 INTEGER, key2 INTEGER, "
                       "PRIMARY KEY(key1, key2))")

    output_conn.commit()
    output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    dim1 = Dimension(
        name='dim1',
        key='key1',
        attributes=['attr1', 'attr2']
    )

    dim2 = Dimension(
        name='dim2',
        key='key2',
        attributes=['attr3', 'attr4'],
    )

    ft1 = FactTable(
        name='ft1',
        keyrefs=['key1', 'key2'],
        measures=['measure']
    )

    facts = []
    dim1_data = []
    dim2_data = []

    total_elapsed = 0.
    print('Generating ft1 data')
    start = time.monotonic()
    counter = 0
    for i in range(1, number + 1):
        for j in range(1, number + 1):
            counter += 10
            facts.append({'key1': i, 'key2': j, 'measure': counter})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim1 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim1_data.append({'attr1': i, 'attr2': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim2 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim2_data.append({'attr3': i, 'attr4': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into ft1')
    start = time.monotonic()
    for row in facts:
        ft1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim1')
    start = time.monotonic()
    for row in dim1_data:
        dim1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim2')
    start = time.monotonic()
    for row in dim2_data:
        dim2.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('DW populated')
    print('Total time: {}{}\n'.format(round(total_elapsed, 3), 's'))
    output_conn.commit()
コード例 #4
0
    {'attr1': None},
    {'attr1': 15},
    {'attr1': None},
    {'attr1': None},
    {'attr1': 1},
    {'attr1': 25},
    {'attr1': None},
    {'attr1': 5},
    {'attr1': None},
    {'attr1': None}
]

wrapper = pygrametl.ConnectionWrapper(connection=null_conn)

dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'],
)

for row in data:
    dim1.insert(row)

dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)

print(notnull_tester.run(null_rep))

null_conn.close()
コード例 #5
0
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])


# NOTE: Most of the following code is taken directly form pygrametl.org and has little to do with
# making this example portable to SQLite and this particular implementation. It is however down here
# that a lot of the cool stuff happens, i.e. the ETL stuff.
# Python function needed to split the timestamp into its three parts

# We put the regions into our location_dimension
[location_dimension.insert(row) for row in region_source]
csv_file_handle.close()

# Here are all of our steps.
# They make the necessary transformations so that we can load our data into the DW
# Each row in the source is processed through the step chain one at a time.

# Beginning of chain for iterating over rows
step_starter = SourceStep(sales_source)

# Calls the time_splitter function on a row
time_splitter = Step(split_timestamp)

# These steps ensure that an entry related to our row is in the given dimension. If not it is inserted.
# Afterwards the primary key of the given entry is inserted into the row.
ensure_book = DimensionStep(dimension=book_dimension, keyfield='bookid')