def arizona():
    file = requests.get(
        "https://adhsgis.maps.arcgis.com/sharing/rest/content/items/8a2c089c866940bbac0ee70a41ea27bd/data",
        allow_redirects=True)

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    open("COVID19CONFIRMED_BYZIP_excel.xls", 'wb').write(file.content)
    df = pd.read_excel("COVID19CONFIRMED_BYZIP_excel.xls", usecols="A,C")

    df.drop(df[df['ConfirmedCaseCount'] == "Data Suppressed"].index,
            inplace=True)
    # The code below is used to populate the database on first run with city names for each ZIP code.
    # citynames = [get_city_names(row) for row in df['POSTCODE']]
    # df.insert(loc=1, column="City Name", value=citynames)

    # Uploads dataframe to Postgres database.
    table_name = 'arizona'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.set_index('POSTCODE', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
def salva_dados_cadastrais_remoto(df, engine):
    try:
        print('Salvando dados cadastrais no banco de dados remoto...')
        
        #df=df[df['COD_CNPJ']=='97711801000105']
        df.set_index(['TP_FUNDO', 'COD_CNPJ'], inplace=True)

        # it does not matter if if_row_exists is set
        # to "update" or "ignore" for table creation
        upsert(engine=engine,
            df=df,
            table_name='dados_cadastrais',
            if_row_exists='update'
            #,dtype=dtype
        )
    except IndexError as err:
        print('Falha de índice ao salvar registros dos dados cadastrais no banco de dados remoto...', err)
        print('Índice', df.index.names)
        print(df.index[df.index.duplicated(keep=False)])
        print(type(err))    # the exception instance
        print(err.args)     # arguments stored in .args
    except Exception as err:
        print('Falha ao salvar registros dos dados cadastrais no banco de dados remoto...', err)
        print(type(err))    # the exception instance
        print(err.args)     # arguments stored in .args
        return None   
def test_commit_as_you_go(engine, schema):
    df = pd.DataFrame(index=pd.Index(['foo'], name='ix'))
    table_name = TableNames.COMMIT_AS_YOU_GO
    # common keyword arguments for multiple upsert operations below
    common_kwargs = dict(schema=schema,
                         table_name=table_name,
                         if_row_exists='update',
                         dtype={'ix': VARCHAR(3)})

    with engine.connect() as con:
        # skip for sqlalchemy < 2.0 or when future=True flag is not passed
        # during engine creation (commit-as-you-go is a new feature)
        # when this is the case there is no attribute commit or rollback for
        # the connection
        if not hasattr(con, 'commit'):
            pytest.skip(
                'test not possible because there is no attribute "commit" (most likely sqlalchemy < 2)'
            )

        # do some random upsert operation and commit
        upsert(con=con, df=df, **common_kwargs)
        con.commit()

        # do some other operation that requires commit and then rollback
        upsert(con=con, df=df.rename(index={'foo': 'bar'}), **common_kwargs)
        con.rollback()

    # the table in the db should be equal to the initial df as the second
    # operation was rolled back
    df_db = select_table(engine=engine,
                         schema=schema,
                         table_name=table_name,
                         index_col='ix')
    pd.testing.assert_frame_equal(df_db, df)
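The skip above only triggers when the connection lacks commit()/rollback(); a minimal sketch (not from the source, the URL is a placeholder) of creating a commit-as-you-go capable engine on SQLAlchemy 1.4+ via the future flag:

# Minimal sketch, not from the source; the connection URL is a placeholder.
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://user:pass@localhost/mydb", future=True)
with engine.connect() as con:
    # with future=True (or SQLAlchemy 2.0) the connection exposes commit()/rollback()
    assert hasattr(con, 'commit')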
Example #4
def florida():
    # Queries the ArcGIS REST API for case counts by ZIP (JSON response).
    source = requests.get(
        "https://services1.arcgis.com/CY1LXxl9zlJeBuRZ/arcgis/rest/services/Florida_Cases_Zips_COVID19/FeatureServer/0/query?where=0%3D0&outFields=*&outFields=ZIP,COUNTYNAME,POName,Cases_1&returnGeometry=false&f=json"
    )

    # Parses the JSON response into one row per feature.
    data = json.loads(source.text)

    rows = []
    for element in data["features"]:
        rows.append(element["attributes"])

    df = pd.DataFrame(rows)
    cols = [0, 1, 4, 6, 7, 11]
    df = df[df.columns[cols]]

    # Uploads dataframe to Postgres database.
    table_name = 'florida'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.sort_values(by=['ZIP'], inplace=True)

    # Some ZIPs span multiple counties, but data is reported separately
    # for each county, so ZIP cannot be used as index column.
    df.set_index("OBJECTID", inplace=True)

    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
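The comment above explains why OBJECTID, not ZIP, serves as the index; an alternative sketch (not from the source) keys the upsert on the ZIP/county pair instead, assuming that pair uniquely identifies a row in the feed:

# Alternative sketch, not from the source; 'florida_by_zip_county' is an illustrative table name.
df_alt = pd.DataFrame(rows)[["ZIP", "COUNTYNAME", "POName", "Cases_1"]]
df_alt.set_index(["ZIP", "COUNTYNAME"], inplace=True)
pangres.upsert(engine=engine,
               df=df_alt,
               table_name='florida_by_zip_county',
               if_row_exists='update')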
Example #5
def test_crappy_text_insert(engine, schema):
    is_mysql = 'mysql' in engine.dialect.dialect_description
    dtype = {'profileid':VARCHAR(10)} if is_mysql else None
    
    # mix crappy letters with a few normal ones
    crap_char_seq = """/_- ?§$&"',:;*()%[]{}|<>=!+#""" + "\\" + "sknalji"  

    # add columns with crappy names
    # don't do this for MySQL which has more strict rules for column names 
    if not is_mysql:
        for i in range(5):
            random_crappy_col_name = ''.join([random.choice(crap_char_seq)
                                              for i in range(50)])

            df_test = (pd.DataFrame({random_crappy_col_name: ['test', None]})
                       .rename_axis(['profileid'], axis='index', inplace=False))

            # psycopg2 can't process columns with "%" or "(" or ")"
            df_test = fix_psycopg2_bad_cols(df_test)
            upsert(engine=engine, schema=schema, df=df_test, if_row_exists='update', dtype=dtype, **default_args)

    # add crappy text in a column named 'text'
    create_random_text = lambda: ''.join([random.choice(crap_char_seq)
                                          for i in range(10)])

    df_test = (pd.DataFrame({'text': [create_random_text() for i in range(10)]})
               .rename_axis(['profileid'], axis='index', inplace=False))
    upsert(engine=engine, schema=schema, df=df_test, if_row_exists='update', dtype=dtype, **default_args)
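The comment above notes that psycopg2 cannot bind column names containing "%", "(" or ")"; a minimal sketch (the column name is made up) of what fix_psycopg2_bad_cols does to such a frame:

# Minimal sketch, illustrative column name only.
import pandas as pd
from pangres import fix_psycopg2_bad_cols

bad = pd.DataFrame({'pct (%)': [1.0]}, index=pd.Index(['a'], name='profileid'))
good = fix_psycopg2_bad_cols(bad)
print(good.columns.tolist())  # offending characters are replaced (dropped with the default replacements)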
Example #6
def load(**kwargs):
    '''
    Loads the provided file into a Database
    '''
    working_dir = kwargs.get('working_dir')
    worldcities_file = kwargs.get('worldcities_file')

    source_file = os.path.join(working_dir, worldcities_file)
    print(source_file)

    logging.info('Loading %s', source_file)
    frame: pd.DataFrame = pd.read_csv(source_file, index_col=['id'])

    logger.info('Loaded %s rows, %s columns', *frame.shape)
    log_dataframe(frame, logger)

    if not kwargs.get('publish'):
        logger.warning('Publish flag not set, skipping')
        return

    engine: Engine = sqlalchemy.create_engine(kwargs.get('write_url'))

    table = worldcities_file.split('.')[0]
    logger.info('Writing frame to %s', table)
    pangres.upsert(engine, frame, table, if_row_exists='update', create_schema=True, add_new_columns=True)

    engine.dispose()
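A hedged invocation sketch for the load task above; the directory, file name and connection URL are placeholders rather than values from the source:

# Hypothetical invocation; paths and URL are placeholders.
load(working_dir='/tmp/data',
     worldcities_file='worldcities.csv',
     publish=True,
     write_url='postgresql+psycopg2://user:pass@localhost/geo')
# The target table becomes 'worldcities' (the file name without its extension).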
Example #7
def test_create_and_insert_table_multiindex(engine, schema):
    table_name = 'test_multiindex'
    namespace = f'{schema}.{table_name}' if schema is not None else table_name
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    # dtype for index for MySQL... (can't have flexible text length)
    dtype = {
        'ix2': VARCHAR(5)
    } if 'mysql' in engine.dialect.dialect_description else None

    # create
    upsert(engine=engine,
           schema=schema,
           df=df_multiindex,
           table_name=table_name,
           dtype=dtype,
           **default_args)
    df_db = pd.read_sql(f'SELECT * FROM {namespace}',
                        con=engine,
                        index_col=index_col)

    # insert
    upsert(engine=engine,
           schema=schema,
           df=df_multiindex2,
           table_name=table_name,
           dtype=dtype,
           **default_args)
    df_db = pd.read_sql(f'SELECT * FROM {namespace}',
                        con=engine,
                        index_col=index_col)
Example #8
def test_create_table(engine, schema):
    # dtype for index for MySQL... (can't have flexible text length)
    dtype = {'profileid':VARCHAR(10)} if 'mysql' in engine.dialect.dialect_description else None
    
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    upsert(engine=engine, schema=schema, df=df, if_row_exists='update', dtype=dtype, **default_args)
    df_db = read_example_table_from_db(engine=engine, schema=schema, table_name=table_name)
    pd.testing.assert_frame_equal(df, df_db)
Example #9
def test_upsert_ignore(engine, schema):
    dtype = {'profileid':VARCHAR(10)} if 'mysql' in engine.dialect.dialect_description else None
    
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    for _df in (df, df3):
        upsert(engine=engine, schema=schema, df=_df, if_row_exists='ignore', dtype=dtype, **default_args)
    df_db = read_example_table_from_db(engine=engine, schema=schema, table_name=table_name)
    expected = pd.concat((df, df3.tail(1)), axis=0)
    pd.testing.assert_frame_equal(expected, df_db)
Example #10
def test_index_with_null(engine, schema):
    df = pd.DataFrame({'ix':[None], 'foo': [2]}).set_index('ix')
    table_name='test_index_with_null'
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    # don't test for mysql since only a warning is raised and the line is skipped
    if 'mysql' not in engine.dialect.dialect_description:
        try:
            upsert(engine=engine, schema=schema, df=df, table_name=table_name, **default_args)
            raise ValueError('upsert did not fail as expected with null value in index')
        except IntegrityError as e:
            print(f'upsert failed as expected with null value in index. Error was:\n\n{e}')
Example #11
    def _build_index(self):
        def with_index(df, type, column, new_name="symbol"):
            if not "country" in df:
                df["country"] = "unknown"
            else:
                df["country"] = df["country"].replace({
                    None: "unknown",
                    "": "unknown"
                }).fillna('unknown')

            df.index = pd.MultiIndex.from_tuples([
                (s, type, c)
                for s, c in zip(df[column].to_list(), df["country"].to_list())
            ]).rename([
                new_name if new_name is not None else column, "type", "country"
            ])
            df = df.drop([column, "country"], axis=1)
            df = df.drop(df.index[df.index.duplicated('first')], axis=0)
            return df.loc[df.index.dropna()]

        symbols_df = pd.concat(
            [
                with_index(ip.get_bonds(), "BOND",
                           "name"),  # country	"name"	full_name
                with_index(
                    ip.get_certificates(), "CERT", "symbol"
                ),  # country', 'name', 'full_name', '"symbol"', 'issuer', 'isin', 'asset_class', 'underlying'
                with_index(ip.get_cryptos(), "CRYPTO",
                           "symbol"),  # 'name', '"symbol"', 'currency'
                with_index(
                    ip.get_commodities(), "COMM", "name"
                ),  # 'title', 'country', '"name"', 'full_name', 'currency', 'group'
                with_index(
                    ip.get_etfs(), "ETF", "symbol"
                ),  # 'country', 'name', 'full_name', '"symbol"', 'isin', 'asset_class', 'currency', 'stock_exchange', 'def_stock_exchange'
                # with_index(ip.get_funds(), "FUND", "isin"),             # 'country', 'name', 'symbol', 'issuer', '"isin"', 'asset_class', 'currency', 'underlying'
                with_index(
                    ip.get_indices(), "INDEX", "symbol"
                ),  # 'country', 'name', 'full_name', '"symbol"', 'currency', 'class', 'market'
                with_index(
                    ip.get_stocks(), "STOCK", "symbol"
                ),  # ['country', 'name', 'full_name', 'isin', 'currency', '"symbol"'
                with_index(
                    pd.DataFrame(
                        [f'{c}/USD' for c in ip.get_available_currencies()],
                        columns=['symbol']), "FX", "symbol")
            ],
            axis=0)

        # update the index table
        upsert(self.engine,
               symbols_df,
               DataProvider.symbols_table_name,
               if_row_exists='ignore')
Example #12
    def update_symbols(self, **kwargs):
        with self.engine.connect() as con:
            symbols = set(
                con.execute(
                    f'SELECT * FROM {DataProvider.symbols_table_name}'))

        ticker_finder = TickerFinder(symbols)
        for symbols_df in ticker_finder.fetch():
            upsert(self.engine,
                   symbols_df,
                   DataProvider.symbols_table_name,
                   if_row_exists='ignore')
Example #13
def test_only_index(engine, schema, if_row_exists):
    # upsert df with only index
    df = pd.DataFrame({'ix':[1]}).set_index('ix')
    table_name='test_index_only'
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    upsert(engine=engine, schema=schema, df=df, table_name=table_name, if_row_exists=if_row_exists)

    # check data integrity
    namespace = f'{schema}.{table_name}' if schema is not None else table_name
    df_db = pd.read_sql(f'SELECT * FROM {namespace}', con=engine)
    assert 'ix' in df_db.columns
    assert len(df_db) > 0
    assert df_db['ix'].iloc[0] == 1
def test_connection_usable_after_upsert(engine, schema):
    df = pd.DataFrame(index=pd.Index([0], name='ix'))
    with engine.connect() as con:
        # do some random upsert operation
        upsert(con=con,
               df=df,
               schema=schema,
               table_name=TableNames.REUSE_CONNECTION,
               if_row_exists='update')
        # attempt to reuse the connection
        result = con.execute(text('SELECT 1;')).scalar()
        assert result == 1
        commit(con)
Example #15
def massachusetts():
    # Gets url to download file.
    domain = "https://www.mass.gov"
    source = requests.get(
        "https://www.mass.gov/info-details/covid-19-response-reporting")
    soup = BeautifulSoup(source.content, 'html.parser')
    element = soup.find(text=re.compile(
        'Raw data used to create the Weekly Public Health Report'))
    element = element.find_next_sibling('a')
    download_url = domain + element['href']
    headers = {
        "Host": "www.mass.gov",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer":
        "https://www.mass.gov/info-details/covid-19-response-reporting"
    }

    # Retrieves filename.
    file = requests.get(download_url, allow_redirects=True)
    filename = file.headers.get('content-disposition')
    filename = re.findall('filename=(.+)', filename)
    filename = re.findall(
        r'"([^"]*)"', filename[0])  #regex to find text within double quotes
    # At this stage, filename[0] gives filename.

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    with open(filename[0], 'wb') as f:
        f.write(file.content)
    df = pd.read_excel(filename[0],
                       sheet_name="City_town",
                       usecols="A:D,H,I",
                       na_values="*")
    df['Percent positivity'] = df['Percent positivity'].multiply(100)
    df = df.where(df != "<5", df["Positive Tests Last 14 days"], axis=0)
    df = df.drop(columns=["Positive Tests Last 14 days"])
    # Uploads dataframe to Postgres database.
    table_name = 'massachusetts'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.set_index('City/Town', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
def carrega_informe_remoto(informe_diario_df, engine):
    print('Inserindo informe diário no banco de dados remoto...')
    informe_diario_df.set_index(['COD_CNPJ', 'DT_REF'], inplace=True)

    # it does not matter if if_row_exists is set
    # to "update" or "ignore" for table creation
    upsert(engine=engine,
        df=informe_diario_df,
        table_name='informe_diario',
        if_row_exists='update'
        #,dtype=dtype
    )
    
    print('Finalizada inserção de informe diário no banco de dados remoto...')
Example #17
def test_get_nb_rows(engine, schema):
    # config
    table_name = TableNames.WITH_YIELD
    nb_rows, chunksize = 20, 3
    nb_last_chunk = nb_rows % chunksize
    nb_chunks = math.ceil(nb_rows / chunksize)
    # MySQL does not want flexible text length in indices/PK
    dtype = {
        'profileid': VARCHAR(10)
    } if 'mysql' in engine.dialect.dialect_description else None
    df = _TestsExampleTable.create_example_df(nb_rows=nb_rows)

    # iterate over upsert results
    # make sure we can extract the number of updated rows and that it is correct
    iterator = upsert(con=engine,
                      df=df,
                      table_name=table_name,
                      if_row_exists='update',
                      schema=schema,
                      chunksize=chunksize,
                      dtype=dtype,
                      yield_chunks=True)

    for ix, result in enumerate(iterator):
        assert result.rowcount == (chunksize
                                   if ix != nb_chunks - 1 else nb_last_chunk)

    # verify the inserted data is as expected
    # we sort the index for MySQL
    df_db = read_example_table_from_db(engine=engine,
                                       schema=schema,
                                       table_name=table_name)
    pd.testing.assert_frame_equal(df.sort_index(), df_db.sort_index())
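Because the test passes yield_chunks=True, upsert returns a generator of cursor results instead of executing in one go; a minimal sketch (table name is illustrative) of accumulating the total number of affected rows:

# Minimal sketch, illustrative table name: sum the affected rows per chunk.
results = upsert(con=engine, df=df, table_name='some_table', schema=schema,
                 if_row_exists='update', chunksize=3, yield_chunks=True)
total_rows = sum(result.rowcount for result in results)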
Example #18
def upsert_df_to_postgres(df, table_name="oura_api"):
    """Upsert a data frame to a table in Postgres DB"""
    connect = f"postgresql+psycopg2://%s:%s@%s:{db_port.get()}/%s" % (
        db_user.get(),
        db_pass.get(),
        db_host.get(),
        db_db.get(),
    )

    engine = create_engine(connect)

    pg.upsert(
        engine=engine,
        df=df,
        table_name=table_name,
        schema="raw",
        if_row_exists="update",
    )
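A hedged usage sketch for the helper above; the column names are illustrative, the only real requirement being a named index for pangres to use as the primary key:

# Hypothetical usage; 'summary_date' and 'score' are illustrative column names.
summary = pd.DataFrame({'summary_date': ['2021-01-01'], 'score': [84]})
summary = summary.set_index('summary_date')  # named index -> primary key
upsert_df_to_postgres(summary, table_name='oura_api')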
Example #19
    def upsert_arrays(self, df: pd.DataFrame):
        """
        Upsert dataframe to db using pangres

        Parameters
        ----------
        df: pd.DataFrame
            Arrays to upsert
        """
        if not df.empty:
            pangres.upsert(self.engine,
                           df,
                           if_row_exists='update',
                           table_name=RtdArrays.__tablename__,
                           dtype=sql_types,
                           create_schema=False,
                           add_new_columns=False,
                           adapt_dtype_of_empty_db_columns=False)
Example #20
def upsert_df_to_postgres(df, table_name='toggl_api'):
    '''Upsert a data frame to a table in Postgres DB'''
    connect = f"postgresql+psycopg2://%s:%s@%s:{db_port.get()}/%s" % (
        db_user.get(),
        db_pass.get(),
        db_host.get(),
        db_db.get()
    )

    engine = create_engine(connect)

    pg.upsert(
        engine=engine,
        df=df,
        table_name=table_name,
        schema='raw',
        if_row_exists='update'
    )
Example #21
def createFlatTable(queryDF, tablename):
    try:
        print('hi ' + tablename)
        upsert(
            engine=engine,
            df=queryDF,
            table_name=tablename,
            if_row_exists='update',
            add_new_columns=True,
            schema='arcgisuser',
        )
        print('Seems like it works!')
        print('checking/adding oid field')

        addObjectIDandCreateView(tablename)

    except Exception as error:
        print("wahwah" + str(error))
        pass
Example #22
def insert_chunks(engine, schema, chunksize, nb_rows):
    df = _TestsExampleTable.create_example_df(nb_rows=nb_rows)
    table_name = f'test_insert_chunksize_{chunksize}'
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    upsert(
        schema=schema,
        table_name=table_name,
        df=df,
        chunksize=chunksize,
        engine=engine,
        if_row_exists='update',
        # MySQL does not want flexible text length in indices/PK
        dtype={'profileid': VARCHAR(10)}
        if 'mysql' in engine.dialect.dialect_description else None)
    df_db = read_example_table_from_db(engine=engine,
                                       schema=schema,
                                       table_name=table_name)
    # sort index (for MySQL...)
    pd.testing.assert_frame_equal(df.sort_index(), df_db.sort_index())
    def upsert_rtd(rtd: pd.DataFrame):
        """
        Upsert dataframe to db using pangres

        Parameters
        ----------
        rtd: pd.DataFrame
            Data to upsert
        """
        if not rtd.empty:
            engine = get_engine()
            pangres.upsert(engine,
                        rtd,
                        if_row_exists='update',
                        table_name=Rtd.__tablename__,
                        dtype=sql_types,
                        create_schema=False,
                        add_new_columns=False,
                        adapt_dtype_of_empty_db_columns=False)
            engine.dispose()
def test_transaction(engine, schema, trans_op):
    df = pd.DataFrame(index=pd.Index(['foo'], name='ix'))
    table_name = TableNames.COMMIT_OR_ROLLBACK_TRANS
    # common keyword arguments for multiple upsert operations below
    common_kwargs = dict(schema=schema,
                         table_name=table_name,
                         if_row_exists='update',
                         dtype={'ix': VARCHAR(3)})

    with engine.connect() as con:
        trans = con.begin()
        try:
            # do some random upsert operation
            upsert(con=con, df=df, **common_kwargs)
            # do some other operation that requires commit
            upsert(con=con,
                   df=df.rename(index={'foo': 'bar'}),
                   **common_kwargs)
            getattr(trans, trans_op)()  # commit or rollback
        finally:
            trans.close()

    # if trans_op=='commit': make sure we have "bar" and "foo" in the index
    # elif trans_op=='rollback': make sure we don't have any data
    # or that the table was not even created (what is rolled back
    # depends on the database type and other factors)
    if trans_op == 'commit':
        df_db = select_table(engine=engine,
                             schema=schema,
                             table_name=table_name,
                             index_col='ix')
        pd.testing.assert_frame_equal(
            df_db.sort_index(),
            pd.DataFrame(index=pd.Index(['bar', 'foo'], name='ix')))
    elif trans_op == 'rollback':
        df_db = select_table(engine=engine,
                             schema=schema,
                             table_name=table_name,
                             error_if_missing=False)
        # no table or an empty table
        assert df_db is None or len(df_db) == 0
Example #25
def df_to_db(df, table_name,schema=None, index_name='index'):
    """
    Writes a DataFrame to the specified table in the PostgreSQL database.\n
    If the table exists, it will update the rows and insert new rows, otherwise it will create the table.\n
    This uses environment variables to access the DB. Make sure your .env file contains the following (replace with the relevant data):\n
    DB_USER= '******'
    DB_PW = 'super_secret_password'
    DB_URL = 'db_address'
    DB_NAME = 'my_exciting_db_name'
    Parameters
    ----------
    df : DataFrame
        The DataFrame to write to the db. Make sure your columns are of the dtype you want in the db.
    table_name : str
        The `table_name` to update or create the table with in the DB.
    schema : str, optional
        The schema where the table should be located. (default is None, which refers to the `public` schema)
    index_name : str, optional
        The index name (must be the index of your df). Default is `index`.

    """
    #'postgresql://*****:*****@db_address/db'
    try:
        engine = create_engine('postgresql://'+os.environ.get('DB_USER') +':'+os.environ.get('DB_PW')+'@'+os.environ.get('DB_URL')+'/'+ os.environ.get('DB_NAME'))
    except Exception as e:
        print(e)
        print('Could not establish connection to db. Please check credentials in .env file')
        sys.exit(1)
    try:
        df.index.name = index_name
        upsert(engine=engine,
            df=df,
            table_name=table_name,
            if_row_exists='update',
            schema=schema,
            dtype=None)
    except Exception as e:
        print(e)
        print('Could not write data to the specified table, check that the db credentials in .env file are correct and have write permissions')
        sys.exit(1)
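A hedged usage sketch of df_to_db; the table and column names are illustrative only:

# Hypothetical usage; table and column names are illustrative.
data = pd.DataFrame({'id': [1, 2], 'value': ['a', 'b']}).set_index('id')
df_to_db(data, table_name='my_table', schema=None, index_name='id')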
Example #26
0
def test_upsert_with_unique_keys(engine, schema):

    # helpers
    namespace = f'{schema}.{table_name}' if schema is not None else table_name
    read_from_db = lambda: pd.read_sql(
        f'SELECT * FROM {namespace}', con=engine, index_col='row_id')

    # create our test table
    drop_table_if_exists(engine=engine, schema=schema, table_name=table_name)
    create_test_table(engine=engine, schema=schema)

    # add initial data (df_old)
    upsert(engine=engine,
           df=df_old,
           schema=schema,
           table_name='test_unique_key',
           if_row_exists='update')
    df = read_from_db()
    df_expected = df_old.assign(
        row_id=range(1, 4)).reset_index().set_index('row_id')
    pd.testing.assert_frame_equal(df, df_expected)

    # add new data (df_new)
    upsert(engine=engine,
           df=df_new,
           schema=schema,
           table_name='test_unique_key',
           if_row_exists='update')
    df = read_from_db()
    # before creating our expected df we need to handle the special case of postgres
    # where the id of the last row will be 7 instead of 4: the serial sequence behind
    # row_id advances for every proposed row, including the three existing rows that
    # hit the ON CONFLICT UPDATE path, so the next fresh id is 4 (expected) + 3 = 7
    last_row_id = 7 if 'postgres' in engine.dialect.dialect_description else 4
    df_expected = (pd.DataFrame(
        [[1, 'A0001', 'PD100', 10], [2, 'A0002', 'PD200', 20],
         [3, 'A0002', 'PD201', 77], [last_row_id, 'A0003', 'PD300', 30]],
        columns=['row_id'] +
        df_old.reset_index().columns.tolist()).set_index('row_id'))
    pd.testing.assert_frame_equal(df, df_expected)
def update_footballers_table(engine):
    """
    Calculate the start date, end date and tweet count for each footballer (search term).
    Create new columns for each of them and insert them into the footballers table in the database.
    @type engine: object
    """
    start_end_date_df = pd.read_sql_query('''select *
                    from (
                    select distinct on (fb.search) fb.search, tw.date as first_tweet_date
                    from footballers fb
                    join screen_name_tweets snt ON snt.search = fb.search
                    join tweets tw on tw.id = snt.id
                    order by fb.search, tw.date
                    )f
                    join(
                    select distinct on (fb.search) fb.search, tw.date as last_tweet_date
                    from footballers fb
                    join screen_name_tweets snt ON snt.search = fb.search
                    join tweets tw on tw.id = snt.id
                    order by fb.search, tw.date desc
                    )l USING (search)''',
                                          con=engine)

    count_tweets_df = pd.read_sql_query('''select count(search), search
                                    from screen_name_tweets
                                    group by search
                                    order by count(search) desc''',
                                        con=engine)

    footballers_complete_table = pd.merge(start_end_date_df,
                                          count_tweets_df,
                                          on="search")
    footballers_complete_table.set_index('search', inplace=True)
    # update data if the row(on the primary key column) that is being inserted already exists in the table.
    upsert(engine=engine,
           df=footballers_complete_table,
           table_name='footballers',
           if_row_exists='update')
Example #28
def insert_data(dataframe, csv_file, engine):
    users_table = dataframe[["user_id", "user_id_str", "username", "name"]]
    print(f"---------------{csv_file}users table is created---------")
    users_table.drop_duplicates(subset="user_id", inplace=True)
    users_table.set_index('user_id', inplace=True)
    upsert(engine=engine,
           df=users_table,
           table_name='users',
           if_row_exists='update')
    print(f" {csv_file} Users table is inserted to the database")

    # Tweets table
    tweets_table = dataframe[[
        "id", "conversation_id", "user_id", "created_at", "date", "timezone",
        "place", "tweet", "language", "hashtags", "cashtags", "day", "hour",
        "link", "urls", "photos", "video", "thumbnail", "retweet", "nlikes",
        "nreplies", "nretweets", "quote_url"
    ]]
    tweets_table["created_at_parsed"] = tweets_table["created_at"].apply(
        parse_time)
    print(f"---------------{csv_file} tweets table printed---------")
    tweets_table.drop_duplicates(subset=['id'],
                                 inplace=True,
                                 ignore_index=True)
    tweets_table.set_index('id', inplace=True)
    upsert(engine=engine,
           df=tweets_table,
           table_name='tweets',
           if_row_exists='update')
    print(
        f"---------------{csv_file} table was written in the database---------"
    )

    # footballers table
    footballers_table = dataframe[['search']]
    footballers_table.drop_duplicates(subset=['search'],
                                      inplace=True,
                                      ignore_index=True)
    footballers_table.set_index('search', inplace=True)
    upsert(engine=engine,
           df=footballers_table,
           table_name='footballers',
           if_row_exists='update')
    print(f" {csv_file} Footballers table is inserted to the database")

    # screen_name_tweets table
    screen_name_tweets_table = dataframe[["id", "reply_to", "search"]]
    print(
        f"---------------{csv_file} screen_name_tweets table is created---------"
    )
    screen_name_tweets_table.to_sql('screen_name_tweets',
                                    engine,
                                    if_exists='append',
                                    index=False,
                                    chunksize=100)
    print(f" {csv_file} Screen_name_tweets table is inserted to the database")
Example #29
def test_yield_empty_df(engine, schema):
    df = pd.DataFrame({'id': [], 'value': []}).set_index('id')

    # we should get an empty generator back
    iterator = upsert(con=engine,
                      df=df,
                      table_name=TableNames.WITH_YIELD_EMPTY,
                      if_row_exists='update',
                      schema=schema,
                      dtype={
                          'id': INT,
                          'value': INT
                      },
                      yield_chunks=True)

    # the for loop should never run because the generator should be empty
    for result in iterator:
        raise AssertionError(
            'Expected the generator returned by upsert with an empty df to be empty'
        )
def create_projections_table():
    # create the covid_projections db
    engine = create_engine(app_config['sqlalchemy_database_uri'], echo=True)

    # Make same changes as in the load_projections function
    df = pd.read_csv(os.path.join('data', 'merged_projections.csv'), nrows=50)

    dtypes = [
        'category',
        'str',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'category',
        'category',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
        'float32',
    ]

    pd_dtypes = dict(zip(df.columns, dtypes))

    df = pd.read_csv(os.path.join('data', 'merged_projections.csv'),
                     dtype=pd_dtypes)
    df = df[df.model_version != '2020_04_05.05.us']

    df['date'] = pd.to_datetime(df['date'])
    df['model_date'] = pd.to_datetime(
        df['model_version'].str[0:10].str.replace('_', '-'))
    df['location_abbr'] = df['location_name'].map(us_state_abbrev)
    # df = df[df['model_date'] > (datetime.today() - timedelta(days=31))] # only loading model versions from the past 31 days
    index_col = ['location_name', 'date', 'model_date', 'model_name']
    df.set_index(index_col, inplace=True)
    df = df[~df.index.duplicated()]
    # drop old table and insert new table
    # df.to_sql(app_config['database_name'], con=engine, if_exists='replace', method='multi', chunksize=1000) #Todo: Do we want to specify data types in the table?
    # 'ALTER TABLE projections ADD PRIMARY KEY (location_name, date, model_date, model_name);'
    # This upsert package requires us to name the index
    # df.index.name = 'index'

    upsert(engine=engine,
           df=df,
           table_name=app_config['database_name'],
           if_row_exists='ignore',
           chunksize=5000,
           add_new_columns=False,
           create_schema=False)
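The commented note above ("This upsert package requires us to name the index") is the key constraint here: pangres derives the table's primary key from the DataFrame's named (Multi)Index. A minimal sketch with illustrative data and table name:

# Minimal sketch, illustrative data and table name only.
example = pd.DataFrame({'location_name': ['Utah'],
                        'date': [pd.Timestamp('2020-05-01')],
                        'model_date': [pd.Timestamp('2020-04-28')],
                        'model_name': ['IHME'],
                        'value': [1.0]})
example = example.set_index(['location_name', 'date', 'model_date', 'model_name'])
upsert(engine=engine, df=example, table_name='projections_example',
       if_row_exists='ignore')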