def lookup_and_transform(ts_kv_table):
    """The table has the following structure:

    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8   |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227    |
    +---------------------------------+---------------+---------------+--------+

    The output is a dictionary {device_id: table} of tables like this:

    +---------------+-------------+---------------+
    | ts            | Temperature | WindDirection |
    +---------------+-------------+---------------+
    | 1583010011665 | -1.8        | 230           |
    +---------------+-------------+---------------+
    | 1583010000692 | -2.5        | 227           |
    +---------------+-------------+---------------+
    """
    lkp = petl.lookup(ts_kv_table, 'entity_id', value=('key', 'ts', 'value'))
    for entity_id in lkp:
        tbl = [('key', 'ts', 'value')] + lkp[entity_id]
        # Pivot the key/value pairs into one column per key, keyed on ts.
        tbl = petl.recast(tbl, variablefield='key', valuefield='value')
        # Drop any unwanted keys that actually occur in this device's table.
        cut_keys = KEYS_TO_REMOVE & set(petl.fieldnames(tbl))
        tbl = petl.cutout(tbl, *cut_keys)
        tbl = petl.transform.headers.sortheader(tbl)
        tbl = petl.transform.basics.movefield(tbl, 'ts', 0)
        lkp[entity_id] = petl.sort(tbl, 'ts')
    return lkp
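# A minimal usage sketch. It assumes `import petl` at module level; the
# KEYS_TO_REMOVE constant, the device id and the readings below are
# illustrative assumptions, not values from the original code.
import petl

KEYS_TO_REMOVE = {'Status'}  # hypothetical stand-in for the real constant

ts_kv = [
    ('entity_id', 'key', 'ts', 'value'),
    ('dev1', 'Temperature', 1583010011665, -1.8),
    ('dev1', 'WindDirection', 1583010000692, 227),
    ('dev1', 'Status', 1583010000692, 1),  # pivoted, then cut out again
]
for device_id, tbl in lookup_and_transform(ts_kv).items():
    print(device_id)
    print(petl.look(tbl))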
def test_fieldnames():
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2))
    actual = fieldnames(table)
    expect = ['foo', 'bar']
    eq_(expect, actual)

    class CustomField(object):

        def __init__(self, key, description):
            self.key = key
            self.description = description

        def __str__(self):
            return self.key

        def __repr__(self):
            return 'CustomField(%r, %r)' % (self.key, self.description)

    table = ((CustomField('foo', 'Get some foo.'),
              CustomField('bar', 'A lot of bar.')),
             ('a', 1),
             ('b', 2))
    actual = fieldnames(table)
    expect = ['foo', 'bar']
    eq_(expect, actual)
def test_fieldnames(): table = (("foo", "bar"), ("a", 1), ("b", 2)) actual = fieldnames(table) expect = ["foo", "bar"] eq_(expect, actual) class CustomField(object): def __init__(self, key, description): self.key = key self.description = description def __str__(self): return self.key def __repr__(self): return "CustomField(%r, %r)" % (self.key, self.description) table = ((CustomField("foo", "Get some foo."), CustomField("bar", "A lot of bar.")), ("a", 1), ("b", 2)) actual = fieldnames(table) expect = ["foo", "bar"] eq_(expect, actual)
def test_fieldnames(): """Test the fieldnames function.""" table = (('foo', 'bar'), ('a', 1), ('b', 2)) actual = fieldnames(table) expect = ['foo', 'bar'] assertequal(expect, actual) class CustomField(object): def __init__(self, id, description): self.id = id self.description = description def __str__(self): return self.id def __repr__(self): return 'CustomField(%r, %r)' % (self.id, self.description) table = ((CustomField('foo', 'Get some foo.'), CustomField('bar', 'A lot of bar.')), ('a', 1), ('b', 2)) actual = fieldnames(table) expect = ['foo', 'bar'] assertequal(expect, actual)
def split_dataset(dataset, p_train_data, split_mode):
    fields = list(fieldnames(dataset))
    size_dataset = len(values(dataset, fields[0]))
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = abs(size_train_data - size_dataset)
    if split_mode == 'normal':
        # head/tail select the first/last n data rows respectively,
        # so pass the sizes directly (the original subtracted 1,
        # which dropped one row from each split).
        train_data = head(dataset, size_train_data)
        if size_test_data == 0:
            test_data = []
        else:
            test_data = tail(dataset, size_test_data)
    # TODO: shuffle mode still needs to be implemented
    return train_data, test_data
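# A minimal sketch of calling split_dataset, assuming the petl functions it
# uses are imported unqualified, e.g.
# `from petl import fieldnames, values, head, tail`.
from petl import fieldnames, values, head, tail

dataset = [('foo', 'bar'),
           ('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]
train, test = split_dataset(dataset, 0.8, 'normal')
print(len(list(values(train, 'foo'))))  # 4 training rows
print(len(list(values(test, 'foo'))))   # 1 test row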
# header()
##########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.header(table)

# fieldnames()
##############

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.fieldnames(table)
etl.header(table)

# data()
########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
d = etl.data(table)
list(d)

# dicts()
#########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
d = etl.dicts(table)
list(d)
# Data sources we are going to read
uri_confirmed = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
uri_death = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
uri_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Process confirmed cases first
t_confirmed = etl.fromcsv(uri_confirmed)

# Rename the headers
t_confirmed = etl.rename(t_confirmed, {'Country/Region': 'Country'})

# Fix the data types.
# From the fifth column onwards, the data type is integer (the number of people/cases).
# While we are at it, reformat the date headers from 1/23/20 to 2020-01-23.
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header, int)  # fix the data type
        fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # parse the date
        t_confirmed = etl.rename(t_confirmed, header, fecha.strftime('%Y-%m-%d'))
    i = i + 1

# Remove the Province/State, Lat and Lon columns, which we will not use
t_confirmed = etl.cutout(t_confirmed, 0, 2, 3)

# Normalize some country names so we can later assign them a region/continent
def procesar_fuente(path, nombre):
    try:
        # Read the source CSV
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Fix the data types.
        # From the fifth column onwards, the data type is integer (the number of people/cases).
        # While we are at it, reformat the date headers from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # parse the date
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Remove the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Normalize some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field back
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Melt the date columns into data rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Add the continent so we can group by it
        # (get_continent_code is defined elsewhere in this module)
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, upload the table to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except Exception:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
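# A minimal sketch of driving procesar_fuente with the three sources defined
# earlier; the destination table names ('confirmed', 'death', 'recovered') are
# illustrative assumptions, not names taken from the original script.
for uri, nombre in [(uri_confirmed, 'confirmed'),
                    (uri_death, 'death'),
                    (uri_recovered, 'recovered')]:
    procesar_fuente(uri, nombre)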
def extract_values(dataset, field_position):
    # Resolve the positional index to a field name, then pull that column
    # (the original computed `fields` but never used it).
    fields = list(fieldnames(dataset))
    field_values = values(dataset, fields[field_position])
    return field_values
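# Example: pull the second column of a small table (a sketch; assumes
# `from petl import fieldnames, values`).
from petl import fieldnames, values

table = [('foo', 'bar'), ('a', 1), ('b', 2)]
print(list(extract_values(table, 1)))  # [1, 2]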
def add_fields(fields, table):
    # Append each missing field with a None default.
    for field in fields:
        if field not in etl.fieldnames(table):
            table = etl.addfield(table, field, None)
    return table
def remove_fields(fields, table):
    # Cut each listed field, but only if it is actually present.
    for field in fields:
        if field in etl.fieldnames(table):
            table = etl.cutout(table, field)
    return table
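# A short sketch showing the two helpers together (assumes `import petl as etl`).
import petl as etl

table = [('foo', 'bar'), ('a', 1), ('b', 2)]
table = add_fields(['baz'], table)     # adds a 'baz' column filled with None
table = remove_fields(['bar'], table)  # drops the existing 'bar' column
print(etl.fieldnames(table))           # the table now has fields 'foo' and 'baz'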