Example #1
def vcfmeltsamples(table, *samples):
    """
    Melt the samples columns. E.g.::
    
        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = (
        ...     etl
        ...     .fromvcf('fixture/sample.vcf')
        ...     .vcfmeltsamples()
        ... )
        >>> table1
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | CHROM | POS | ID   | REF | ALT | QUAL | FILTER | INFO | SAMPLE    | CALL                                                |
        +=======+=====+======+=====+=====+======+========+======+===========+=====================================================+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))   |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] |   10 | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] |   10 | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        ...

    """

    result = etl.melt(table, key=VCF_HEADER, variables=samples,
                      variablefield='SAMPLE', valuefield='CALL')
    return result
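
The key used in this example, VCF_HEADER, is a module-level constant of petlx that is not shown in the snippet. Judging from the fixed columns in the doctest output above, it presumably covers the standard fixed VCF fields, roughly as follows (an inference from the output, not copied from the petlx source):

# Assumed definition, inferred from the CHROM..INFO columns shown above
VCF_HEADER = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO')
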
Example #2
def meltsamples(tbl, *samples):
    """
    Melt the samples columns. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = cutout(t2, 'INFO')
        >>> look(t3)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'CALL'                                                               |
        +=========+=======+=============+=======+=======+========+==========+===========+======================================================================+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=48, DP=1, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=1/1, GQ=43, DP=5, HQ=[None, None])) |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        
    .. versionadded:: 0.5
    
    """
    result = melt(tbl,
                  key=fixed_fields,
                  variables=samples,
                  variablefield='SAMPLE',
                  valuefield='CALL')
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #3
def meltsamples(tbl, *samples):
    """
    Melt the samples columns. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = cutout(t2, 'INFO')
        >>> look(t3)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'CALL'                                                               |
        +=========+=======+=============+=======+=======+========+==========+===========+======================================================================+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=48, DP=1, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=1/1, GQ=43, DP=5, HQ=[None, None])) |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        
    .. versionadded:: 0.5
    
    """
    result = melt(tbl, key=fixed_fields, variables=samples, variablefield='SAMPLE', valuefield='CALL')
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #4
def vcfmeltsamples(table, *samples):
    """
    Melt the samples columns. E.g.::
    
        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = (
        ...     etl
        ...     .fromvcf('fixture/sample.vcf')
        ...     .vcfmeltsamples()
        ... )
        >>> table1
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | CHROM | POS | ID   | REF | ALT | QUAL | FILTER | INFO | SAMPLE    | CALL                                                |
        +=======+=====+======+=====+=====+======+========+======+===========+=====================================================+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] |  9.6 | None   | {}   | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))   |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] |   10 | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] |   10 | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        ...

    """

    result = etl.melt(table,
                      key=VCF_HEADER,
                      variables=samples,
                      variablefield='SAMPLE',
                      valuefield='CALL')
    return result
Example #5

# melt

table1 = [['id', 'gender', 'age'],
          [1, 'F', 12],
          [2, 'M', 17],
          [3, 'M', 16]]
table3 = [['id', 'time', 'height', 'weight'],
          [1, 11, 66.4, 12.2],
          [2, 16, 53.2, 17.3],
          [3, 12, 34.5, 9.4]]

from petl import melt, look
look(table1)
table2 = melt(table1, 'id')
look(table2)
# compound keys are supported
look(table3)
table4 = melt(table3, key=['id', 'time'])
look(table4)
# a subset of variable fields can be selected
table5 = melt(table3, key=['id', 'time'], variables=['height'])    
look(table5)


# recast

table1 = [['id', 'variable', 'value'],
          [3, 'age', 16],
          [1, 'gender', 'F'],
Example #6

# melt

table1 = [['id', 'gender', 'age'],
          [1, 'F', 12],
          [2, 'M', 17],
          [3, 'M', 16]]
table3 = [['id', 'time', 'height', 'weight'],
          [1, 11, 66.4, 12.2],
          [2, 16, 53.2, 17.3],
          [3, 12, 34.5, 9.4]]

from petl import melt, look
look(table1)
table2 = melt(table1, 'id')
look(table2)
# compound keys are supported
look(table3)
table4 = melt(table3, key=['id', 'time'])
look(table4)
# a subset of variable fields can be selected
table5 = melt(table3, key=['id', 'time'], variables=['height'])    
look(table5)


# recast

table1 = [['id', 'variable', 'value'],
          [3, 'age', 16],
          [1, 'gender', 'F'],
Example #7
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma',
                          'Myanmar')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'US', 'USA')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Taiwan*',
                          'Taiwan')

# Then group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field back again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Then melt the date columns into data rows and rename the new columns
# (see the standalone sketch after this example)
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Then add the continent so we can group by it
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))

# And again make sure the values have the data types they should have
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
t_confirmed = etl.convert(t_confirmed, 'Date',
                          lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

# Finally, load the data into the data repository
conn = pymysql.connect(password='******', database='covid', user='******')
conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
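
The melt step above is what reshapes the one-column-per-date layout of the source CSV into one row per country and date. A minimal standalone sketch of that reshaping step, with made-up values rather than the real case counts:

import petl as etl

# Tiny wide table shaped like the cleaned source data (illustrative values only)
wide = [['Country', '2020-01-22', '2020-01-23'],
        ['Afghanistan', 0, 0],
        ['Albania', 0, 1]]

melted = etl.melt(wide, 'Country')   # one row per (Country, date) pair
melted = etl.rename(melted, {'variable': 'Date', 'value': 'Cases'})
print(etl.look(melted))              # columns are now Country | Date | Cases
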
Example #8
    def long_table(self,
                   key,
                   column,
                   key_rename=None,
                   retain_original=False,
                   prepend=True,
                   prepend_value=None):
        """
        Create a new long parsons table from a column, including the foreign
        key.

        .. code-block:: python

           # Begin with nested dicts in a column
           json = [{'id': '5421',
                    'name': 'Jane Green',
                    'emails': [{'home': '*****@*****.**'},
                               {'work': '*****@*****.**'}
                              ]
                   }
                  ]
           tbl = Table(json)
           print (tbl)
           >>> {'id': '5421', 'name': 'Jane Green', 'emails': [{'home': '*****@*****.**'}, {'work': '*****@*****.**'}]} # noqa: E501
           >>> {'id': '5421', 'name': 'Jane Green', 'emails': [{'home': '*****@*****.**'}, {'work': '*****@*****.**'}]} # noqa: E501

           # Create skinny table of just the nested dicts
           email_skinny = tbl.long_table(['id'], 'emails')

           print (email_skinny)
           >>> {'id': '5421', 'emails_home': '*****@*****.**', 'emails_work': None}
           >>> {'id': '5421', 'emails_home': None, 'emails_work': '*****@*****.**'}

        `Args:`
            key: lst
                The columns to retain in the long table (e.g. foreign keys)
            column: str
                The column name to make long
            key_rename: dict
                The new name for the foreign key to better identify it. For
                example, you might want to rename ``id`` to ``person_id``.
                Ex. {'KEY_NAME': 'NEW_KEY_NAME'}
            retain_original: boolean
                Retain the original column from the source table.
            prepend:
                Prepend the column name of the unpacked values. Useful for
                avoiding duplicate column names
            prepend_value:
                Value to prepend new columns if ``prepend=True``. If None, will
                set to column name.
        `Returns:`
            Parsons Table
                The new long table
        """

        if type(key) == str:
            key = [key]

        lt = self.cut(*key, column)  # Create a table of key and column
        lt.unpack_list(column, replace=True)  # Unpack the list
        lt.table = petl.melt(lt.table, key)  # Melt into a long table
        lt = lt.cut(*key, 'value')  # Get rid of column names created in unpack
        lt.rename_column('value', column)  # Rename 'value' to old column name
        lt.remove_null_rows(column)  # Remove null values

        # If a new key name is specified, rename
        if key_rename:
            for k, v in key_rename.items():
                lt.rename_column(k, v)

        # If there is a nested dict in the column, unpack it
        if lt.num_rows > 0 and isinstance(lt.table[column][0], dict):
            lt.unpack_dict(column,
                           prepend=prepend,
                           prepend_value=prepend_value)

        if not retain_original:
            self.remove_column(column)

        return lt
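
The key_rename and prepend_value arguments are not exercised by the docstring example above. A minimal sketch of how they might be combined, assuming the parsons.Table API as documented here (names and addresses are placeholders):

from parsons import Table

people = Table([{'id': '5421',
                 'name': 'Jane Green',
                 'emails': [{'home': 'jane@home.example'},
                            {'work': 'jane@work.example'}]}])

# Keep 'id' as the foreign key but publish it as 'person_id', and prefix the
# unpacked dict keys with 'email' rather than the source column name 'emails'
email_tbl = people.long_table(['id'], 'emails',
                              key_rename={'id': 'person_id'},
                              prepend_value='email')
print(email_tbl)
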
Example #9
    def unpack_nested_columns_as_rows(self,
                                      column,
                                      key='id',
                                      expand_original=False):
        """
        Unpack list or dict values from one column into separate rows.
        Not recommended for JSON columns (i.e. lists of dicts), but can handle columns
        with any mix of types. Makes use of PETL's `melt()` method.

        `Args:`
            column: str
                The column name to unpack
            key: str
                The column to use as a key when unpacking. Defaults to `id`
            expand_original: boolean or int
                If `True`: Add resulting unpacked rows (with all other columns) to original
                If `int`: Add to original unless the max added per key is above the given number
                If `False` (default): Return unpacked rows (with `key` column only) as standalone
                Removes packed list and dict rows from original either way.
        `Returns:`
            If `expand_original`, original table with packed rows replaced by unpacked rows
            Otherwise, standalone table with key column and unpacked values only
        """

        if isinstance(expand_original, int) and expand_original is not True:
            lengths = {
                len(row[column])
                for row in self if isinstance(row[column], (dict, list))
            }
            max_len = sorted(lengths, reverse=True)[0]
            if max_len > expand_original:
                expand_original = False

        if expand_original:
            # Include all columns and filter out other non-dict types in table_list
            table = self
            table_list = table.select_rows(
                lambda row: isinstance(row[column], list))
        else:
            # Otherwise, include only key and column, but keep all non-dict types in table_list
            table = self.cut(key, column)
            table_list = table.select_rows(
                lambda row: not isinstance(row[column], dict))

        # All the columns other than column to ignore while melting
        ignore_cols = table.columns
        ignore_cols.remove(column)

        # Unpack lists as separate columns
        table_list.unpack_list(column, replace=True)

        # Rename the columns to retain only the number
        for col in table_list.columns:
            if f'{column}_' in col:
                table_list.rename_column(col, col.replace(f'{column}_', ""))

        # Filter dicts and unpack as separate columns
        table_dict = table.select_rows(
            lambda row: isinstance(row[column], dict))
        table_dict.unpack_dict(column, prepend=False)

        from parsons.etl.table import Table

        # Use melt to pivot both sets of columns into their own Tables and clean out None values
        melted_list = Table(petl.melt(table_list.table, ignore_cols))
        melted_dict = Table(petl.melt(table_dict.table, ignore_cols))

        melted_list.remove_null_rows('value')
        melted_dict.remove_null_rows('value')

        melted_list.rename_column('variable', column)
        melted_dict.rename_column('variable', column)

        # Combine the list and dict Tables
        melted_list.concat(melted_dict)

        import hashlib

        if expand_original:
            # Add unpacked rows to the original table (minus packed rows)
            orig = self.select_rows(
                lambda row: not isinstance(row[column], (dict, list)))
            orig.concat(melted_list)
            # Add unique id column by hashing all the other fields
            if 'uid' not in self.columns:
                orig.add_column(
                    'uid', lambda row: hashlib.md5(
                        str.encode(''.join([str(x)
                                            for x in row]))).hexdigest())
                orig.move_column('uid', 0)

            # Rename value column in case this is done again to this Table
            orig.rename_column('value', f'{column}_value')

            # Keep column next to column_value
            orig.move_column(column, -1)
            output = orig
        else:
            orig = self.remove_column(column)
            # Add unique id column by hashing all the other fields
            melted_list.add_column(
                'uid', lambda row: hashlib.md5(
                    str.encode(''.join([str(x) for x in row]))).hexdigest())
            melted_list.move_column('uid', 0)
            output = melted_list

        self = orig
        return output
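
A short usage sketch of unpack_nested_columns_as_rows, again assuming the parsons.Table API described in the docstring (the data is made up):

from parsons import Table

tbl = Table([{'id': 1, 'tags': ['red', 'blue']},
             {'id': 2, 'tags': {'colour': 'green', 'size': 'L'}}])

# Standalone result: one row per unpacked value, keyed by 'id', with a
# generated 'uid' column and the unpacked names/indices in the 'tags' column
unpacked = tbl.unpack_nested_columns_as_rows('tags', key='id')
print(unpacked)

# Passing expand_original=True instead merges the unpacked rows back into a
# copy of the original table, as described in the docstring above.
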
Example #10
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)

    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)

    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)

    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER)  = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)

        print('knmi')
        print(table)
        table.log_progress()
        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)

        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)

        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']

        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)

        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)

        final_knmi_table = etl.melt(final_knmi_table,
                                    key=[
                                        'weather_station_id', 'timestamp',
                                        'id', 'latitude', 'longitude', 'name',
                                        'station_number', 'data_source_id',
                                        'altitude'
                                    ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable',
                                      'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table,
                                     table_com,
                                     key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2,
                                     table_time,
                                     key='timestamp')
        print('dos')

        print(final_knmi_table2)
        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]

        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(final_lmn_table)

        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table,
                                    table_com,
                                    key='component')
        # print(final_lmn_table2)

        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)
        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)

        final_lmn_table2 = etl.join(final_lmn_table2,
                                    table_time,
                                    key='timestamp')
        # print(final_lmn_table2)

        print(final_lmn_table2)
        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)

        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')
Example #11
from __future__ import absolute_import, print_function, division


# melt()
########

import petl as etl

table1 = [["id", "gender", "age"], [1, "F", 12], [2, "M", 17], [3, "M", 16]]
table2 = etl.melt(table1, "id")
table2.lookall()
# compound keys are supported
table3 = [["id", "time", "height", "weight"], [1, 11, 66.4, 12.2], [2, 16, 53.2, 17.3], [3, 12, 34.5, 9.4]]
table4 = etl.melt(table3, key=["id", "time"])
table4.lookall()
# a subset of variable fields can be selected
table5 = etl.melt(table3, key=["id", "time"], variables=["height"])
table5.lookall()


# recast()
##########

import petl as etl

table1 = [
    ["id", "variable", "value"],
    [3, "age", 16],
    [1, "gender", "F"],
    [2, "gender", "M"],
    [2, "age", 17],
Example #12
def procesar_fuente(path, nombre):
    try: 
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From the fifth column onwards the data type is integer, i.e. the number of people/cases
        # We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
        headers = etl.fieldnames(tabla)
        i=0
        for header in headers:
            if i>=4:
                tabla = etl.convert(tabla, header, int)        # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')    # parse the date into the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))   
            i = i + 1

        # Remove the Province/State, Lat and Lon columns that we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so that we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field back again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then melt the date columns into data rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent so we can group by it
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the values have the data types they should have
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d') )

        # Finally, load the data into the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise