Example #1
    def test_upload_data_if_table_exists_replace(self):

        raise nose.SkipTest("buggy test")

        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_path())

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, destination_table,
                   _get_project_id(), if_exists='replace',
                   private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=_get_project_id(),
                              private_key=_get_private_key_path())
        self.assertEqual(result['NUM_ROWS'][0], 5)
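
Several of these tests rely on a make_mixed_dataframe_v2() helper defined elsewhere in the test suite. A minimal sketch of what it plausibly looks like, inferred from the inline construction in Example #10 below; treat it as an assumption, not the canonical helper:

import datetime
import numpy as np
import pytz
from pandas import DataFrame

def make_mixed_dataframe_v2(test_size):
    # Sketch: one column per BQ scalar type except RECORD, mirroring Example #10.
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.datetime.now(pytz.timezone('US/Arizona'))
             for t in range(test_size)]
    return DataFrame({'bools': bools[0], 'flts': flts[0], 'ints': ints[0],
                      'strs': strs[0], 'times': times[0]},
                     index=range(test_size))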
Example #2
def main():
    query_android = "SELECT distinct if (af_channel is not null, concat(media_source,'_',af_channel),media_source) as source," \
                    "ifnull(campaign,fb_campaign_name) as campaign, regexp_extract(event_value,'09835[0-9]{7}') as orders " \
                    "FROM AppsFlyer.android_events " \
                    "WHERE regexp_extract(event_value, '09835[0-9]{7}') is not null and media_source!='Organic' " \
                    f"and _PARTITIONTIME = TIMESTAMP('{currdate}')"

    query_ios = "SELECT distinct if (af_channel is not null, concat(media_source,'_',af_channel),media_source) as source," \
                "ifnull(campaign,fb_campaign_name) as campaign, regexp_extract(event_value,'09835[0-9]{7}') as orders " \
                "FROM AppsFlyer.ios_events " \
                "WHERE regexp_extract(event_value , '09835[0-9]{7}') is not null and media_source!='Organic' " \
                f"and _PARTITIONTIME = TIMESTAMP('{currdate}')"
    app = [[row[i] for i in range(3)] for row in queryData(query_android)] + [[row[i] for i in range(3)] for row in queryData(query_ios)]

    if len(app) > 0:
        # app = list(dict((x[2], x) for x in app).values())  # remove duplicate transactions - kept for history
        df_app = pd.DataFrame.from_records(app, columns=['source','campaign','orders'])
        df_app.drop_duplicates(subset='orders', inplace=True)  # remove duplicate transactions
        gbq.to_gbq(df_app, 'Mig_Data.temp_app', projectid, if_exists=import_action)
        QUERY_upd = f"UPDATE `Mig_Data.Orders` o SET o.source = t.source, o.campaign=t.campaign " \
                    f"FROM `Mig_Data.temp_app` t " \
                    f"WHERE o.transaction = t.orders"
        queryData(QUERY_upd)
        table_ref = client.dataset('Mig_Data').table('temp_app')
        if client.get_table(table_ref):
            client.delete_table(table_ref)
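
This example (and Example #4) assumes a queryData() helper defined elsewhere. A plausible sketch using google-cloud-bigquery; the names client and projectid are assumptions taken from the surrounding code, and the optional p label matches its use in Example #4:

from google.cloud import bigquery

client = bigquery.Client(project=projectid)  # projectid comes from the project's config

def queryData(query, p=None):
    # Run a query and return its rows; blocks until the job finishes.
    if p:
        print(p)  # optional progress label
    return list(client.query(query).result())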
Example #3
def load_albums_bq(albums):
    """Loads json to BigQuery table."""
    from pandas.io import gbq

    # Constants
    project_id = 'secret-compass-181513'
    dataset_name = 'spotify'

    dic_flattened = [flatten_json(d) for d in albums['items']]
    df = pd.DataFrame(dic_flattened)

    # Save albums in Google BigQuery
    bigquery_client = bigquery.Client(project=project_id)
    dataset = bigquery_client.dataset(dataset_name)

    df_recent_albums = df[[
        'album_uri', 'added_at', 'album_artists_0_name', 'album_name',
        'album_type', 'album_label', 'album_release_date',
        'album_tracks_total', 'album_id'
    ]]

    gbq.to_gbq(df_recent_albums,
               'spotify.new_albums',
               project_id,
               if_exists='append')

    return 'dataset loaded to BQ'
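
flatten_json() is defined elsewhere; a minimal recursive sketch (an assumption, not the project's actual helper) that would produce the underscore-joined keys used above, e.g. album_artists_0_name:

def flatten_json(y, sep='_'):
    # Collapse nested dicts/lists into one level, joining keys with '_',
    # so {'album': {'artists': [{'name': 'X'}]}} -> {'album_artists_0_name': 'X'}.
    out = {}

    def walk(node, prefix=''):
        if isinstance(node, dict):
            for k, v in node.items():
                walk(v, prefix + k + sep)
        elif isinstance(node, list):
            for i, v in enumerate(node):
                walk(v, prefix + str(i) + sep)
        else:
            out[prefix[:-len(sep)]] = node

    walk(y)
    return out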
Example #4
def userIdData():
    p = userIdDict()
    for dt, u_id in p.items():
        try:
            QUERY = "with a as ( SELECT distinct date, user.id, " \
                    "trafficSource.source,  trafficSource.medium, trafficSource.campaign, clientId, if( visitNumber=1,'New Visitor','Returning Visitor') as userTypeGA," \
                    "trafficSource.keyword, t.device.browser as browser, t.device.deviceCategory as deviceCategory, geoNetwork.city," \
                    "count(user.id)  OVER (PARTITION BY user.id  order by user.id desc ) cont_number " \
                    "FROM `{0}.{1}.owoxbi_sessions_{2}` as t where safe_cast( user.id as int64) in UNNEST({3})) " \
                    "select * from a where cont_number=1".format(PROJECT_ID, DATASET_ID, dt.replace('-',''), u_id)
            z = [[row[i] for i in range(11)] for row in queryData(QUERY)]
            df = pd.DataFrame.from_records(z,
                                           columns=[
                                               'date', 'user', 'source',
                                               'Medium', 'campaign',
                                               'clientId', 'userTypeGA',
                                               'keyword', 'browser',
                                               'deviceCategory', 'city'
                                           ])
            gbq.to_gbq(df, 'Mig_Data.temp', projectid,
                       if_exists=import_action)  # send the data to GBQ
            QUERY_update = "UPDATE `{0}.Mig_Data.Orders` as a set a.source=t.source, a.Medium=t.Medium, a.campaign=t.campaign, a.clientId = t.clientId, a.userTypeGA =t.userTypeGA, a.keyword=t.keyword," \
                           "a.browser=t.browser, a.device=t.deviceCategory, a.city=t.city " \
                           "from `{0}.Mig_Data.temp` as t " \
                           "where a.date=t.date and kpp=safe_cast(user as int64)".format(PROJECT_ID)
            queryData(QUERY_update, p='via the userId function')
            table_ref = client.dataset('Mig_Data').table('temp')
            if client.get_table(table_ref):
                client.delete_table(table_ref)
        except Exception:  # skip this date on any query error and move on
            continue
Example #5
def fin_obrabotchik(analytics, token):
    primary_data_VK = vk_data_posts(token)
    secondary_data_VK = vk_data_postID(token)
    third_data_GA = GA_Data(analytics, token)

    F_labels = [
        'Id', 'likes', 'reposts', 'comments', 'name', 'date', 'postUrl', 'url'
    ]
    S_labels = ['Id', 'reach_total', 'reach_subscribers', 'links', 'socialNet']
    Th_labels = [
        'Id', 'sessions', 'bounceRate', 'pagePerSession', 'duration',
        'transactions', 'revenue'
    ]
    F_df = pd.DataFrame.from_records(primary_data_VK, columns=F_labels)
    S_df = pd.DataFrame.from_records(secondary_data_VK, columns=S_labels)
    Th_df = pd.DataFrame.from_records(third_data_GA, columns=Th_labels)

    mergedData = pd.merge(F_df, S_df, on=['Id'])
    mergedData_Final = pd.merge(mergedData, Th_df, how='left', on=['Id'])
    mergedData_Final = mergedData_Final[[
        'Id', 'date', 'name', 'reach_total', 'reach_subscribers', 'links',
        'likes', 'reposts', 'comments', 'url', 'sessions', 'bounceRate',
        'pagePerSession', 'duration', 'transactions', 'revenue', 'postUrl',
        'socialNet'
    ]]  # set the column order
    # mergedData_Final.to_csv('111.csv')

    gbq.to_gbq(mergedData_Final,
               f'{DATASET_ID}.{TABLE_ID_Temp}',
               '78997000000',
               if_exists='replace')
Example #6
def insert_row(df, table, replace_val):
    if len(df.index) == 0:
        print("gbq insert records zero")
        return
    project_id = _common_conf['bigquery']['project_id']
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']

    # insert 10,000 rows at a time
    full_table = "{}.{}".format(dataset, table)
    client = get_client(json_key_file=private_key_path,
                        readonly=False,
                        swallow_results=False)

    if client.check_table(dataset, table):
        bq_limit = 10000
        q_num = len(df.index) // bq_limit
        for i in range(0, q_num + 1):
            client = get_client(json_key_file=private_key_path,
                                readonly=False,
                                swallow_results=False)
            ins_df = df[i * bq_limit:(i + 1) * bq_limit].replace(
                np.nan, replace_val)

            row_dict = ins_df.to_dict(orient='index')
            row_data = [x for x in row_dict.values()]
            ret = client.push_rows(dataset, table, row_data)
            if 'insertErrors' in ret:
                msg = "BigQuery Insert Error:\nsample:\n{}\nerror:\n{}"
                raise Exception(msg.format(row_data[0:5], ret))
    else:
        print('{} CREATE TABLE'.format(full_table))
        gbq.to_gbq(df, full_table, project_id)
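
Note that to_gbq can do the same 10,000-row batching by itself via its chunksize parameter; a sketch of the equivalent call, assuming the table already exists:

# Equivalent sketch: let pandas-gbq stream the rows in 10,000-row chunks
# instead of slicing the DataFrame manually.
gbq.to_gbq(df, full_table, project_id, chunksize=10000, if_exists='append')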
Example #7
    def test_upload_data_if_table_exists_replace(self):
        table_name = 'new_test4'

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df,
                   "pydata_pandas_bq_testing." + table_name,
                   PROJECT_ID,
                   chunksize=10000)

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema,
                   "pydata_pandas_bq_testing." + table_name,
                   PROJECT_ID,
                   if_exists='replace')

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." +
            table_name,
            project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], 5)
Example #8
    def test_upload_new_table(self):
        # Attempting to upload to a new table with valid data and a valid schema should succeed
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        schema = [
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
            'INTEGER', 'STRING', 'INTEGER'
        ]

        array = [[
            'TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
            '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi',
            11210
        ]]
        df = DataFrame(array,
                       columns=[
                           'title', 'id', 'language', 'wp_namespace',
                           'is_redirect', 'revision_id', 'contributor_ip',
                           'contributor_id', 'contributor_username',
                           'timestamp', 'is_minor', 'is_bot', 'reversion_id',
                           'comment', 'num_characters'
                       ])
        gbq.to_gbq(df,
                   'pandas_testing_dataset.test_data2',
                   schema=schema,
                   col_order=None,
                   if_exists='append')
        a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data2")
        self.assertTrue((a == df).all().all())
Example #9
    def test_upload_bad_data_table(self):
        # Attempting to upload data that does not match schema should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        schema = [
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
            'INTEGER', 'STRING', 'INTEGER'
        ]

        array = [[
            'TESTING_GBQ\',', False, 'hi', 0, True, 'STRING IN INTEGER',
            '00.000.00.000', 1, 'hola', 99999999, -100, 1000, 1, 'Jedi', 11210
        ]]
        df = DataFrame(array,
                       columns=[
                           'title', 'id', 'language', 'wp_namespace',
                           'is_redirect', 'revision_id', 'contributor_ip',
                           'contributor_id', 'contributor_username',
                           'timestamp', 'is_minor', 'is_bot', 'reversion_id',
                           'comment', 'num_characters'
                       ])
        with self.assertRaises(bigquery_client.BigqueryServiceError):
            gbq.to_gbq(df,
                       'pandas_testing_dataset.test_data1',
                       schema=schema,
                       col_order=None,
                       if_exists='append')
Example #10
    def test_upload_data(self):
        test_size = 1000001
        # create df to test for all BQ datatypes except RECORD
        bools = np.random.randint(2, size=(1, test_size)).astype(bool)
        flts = np.random.randn(1, test_size)
        ints = np.random.randint(1, 10, size=(1, test_size))
        strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
        times = [
            datetime.datetime.now(pytz.timezone('US/Arizona'))
            for t in xrange(test_size)
        ]
        df = DataFrame(
            {
                'bools': bools[0],
                'flts': flts[0],
                'ints': ints[0],
                'strs': strs[0],
                'times': times[0]
            },
            index=range(test_size))
        gbq.to_gbq(df,
                   "pydata_pandas_bq_testing.new_test",
                   project_id=PROJECT_ID,
                   chunksize=10000)
        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test",
            project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #11
    def test_upload_replace(self):
        # Attempting to overwrite an existing table with valid data and a valid schema should succeed
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER',
                  'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER']

        # Setup an existing table
        array1 = [['', 1, '', 1, False, 1, '00.111.00.111', 1, 'hola',
                 1, True, True, 1, 'Sith', 1]]
        df1 = DataFrame(array1, columns=['title','id','language','wp_namespace','is_redirect','revision_id',
                                       'contributor_ip','contributor_id','contributor_username','timestamp',
                                       'is_minor','is_bot','reversion_id','comment','num_characters'])
        gbq.to_gbq(df1, 'pandas_testing_dataset.test_data5', schema=schema, col_order=None, if_exists='fail')
        
        array2 = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999, '00.000.00.000', 1, 'hola',
                 99999999, False, False, 1, 'Jedi', 11210]]

        # Overwrite the existing table with different data
        df2 = DataFrame(array2, columns=['title','id','language','wp_namespace','is_redirect','revision_id',
                                       'contributor_ip','contributor_id','contributor_username','timestamp',
                                       'is_minor','is_bot','reversion_id','comment','num_characters'])
        gbq.to_gbq(df2, 'pandas_testing_dataset.test_data5', schema=schema, col_order=None, if_exists='replace')
        
        # Read the table and confirm the new data is all that is there
        a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data5")
        self.assertTrue((a == df2).all().all())
Example #12
def load_tracks_bq(tracks):
    """Loads json to BigQuery table."""
    from pandas.io import gbq
    from pandas.io.json import json_normalize  # package for flattening JSON in a pandas df

    # Constants
    project_id = 'secret-compass-181513'
    dataset_name = 'spotify'

    dic_flattened = [flatten_json(d) for d in tracks['items']]
    df = pd.DataFrame(dic_flattened)

    # Save tracks in Google BigQuery
    bigquery_client = bigquery.Client(project=project_id)
    dataset = bigquery_client.dataset(dataset_name)

    df_recently_played = df[[
        'track_name', 'played_at', 'track_album_artists_0_name',
        'track_album_name', 'track_type', 'track_uri'
    ]]

    gbq.to_gbq(df_recently_played,
               'spotify.play_history_new',
               project_id,
               if_exists='append')

    return 'dataset loaded to BQ'
Example #13
    def test_upload_replace_schema_error(self):
        # Attempting to replace an existing table without specifying a schema should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        df = DataFrame(self.correct_data_small)
        with self.assertRaises(gbq.SchemaMissing):
            gbq.to_gbq(df, 'pandas_testing_dataset.test_database', schema=None, col_order=None, if_exists='replace')
Example #14
    def test_invalid_column_name_schema(self):
        # Specifying a schema that contains an invalid column name should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        schema = ['INCORRECT']
        df = DataFrame([[1]],columns=['fake'])
        with self.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df, 'pandas_testing_dataset.test_data', schema=schema, col_order=None, if_exists='append')
Example #15
    def test_invalid_number_of_columns_schema(self):
        # Specifying a schema that does not have same shape as dataframe should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        schema = ['INTEGER']
        df = DataFrame([[1, 'STRING']],columns=['fake','fake'])
        with self.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df, 'pandas_testing_dataset.test_data4', schema=schema, col_order=None, if_exists='append')
Example #16
    def test_google_upload_errors_should_raise_exception(self):
        table_name = 'new_test5'

        test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0], 'ints': [0, '1'], 'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]}, index=range(2))

        with tm.assertRaises(gbq.StreamingInsertError):
            gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.' + table_name, PROJECT_ID, verbose=True)
Example #17
    def test_google_upload_errors_should_raise_exception(self):
        destination_table = DESTINATION_TABLE + "5"

        test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0], 'ints': [0, '1'], 'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]}, index=range(2))

        with tm.assertRaises(gbq.StreamingInsertError):
            gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)
Example #18
    def test_google_upload_errors_should_raise_exception(self):
        test_timestamp = datetime.datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False],
                            'flts': [0.0, 1.0],
                            'ints': [0, '1'],
                            'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]
                            }, index=range(2))
        with tm.assertRaises(gbq.UnknownGBQException):
            gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id=PROJECT_ID)
Example #19
def stream_dataToBQ(ins_data):
    # BigQuery params
    PROJECT_ID = '78997000000'
    DATASET_ID = 'Temp'
    TABLE_ID = 'OrderPandasPartit'
    import_action = 'append'
    gbq.to_gbq(ins_data,
               f'{DATASET_ID}.{TABLE_ID}',
               PROJECT_ID,
               if_exists=import_action)
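
Hypothetical usage of the streaming helper above; the column names are invented for illustration:

orders = pd.DataFrame({'order_id': [1, 2], 'order_total': [10.5, 20.0]})
stream_dataToBQ(orders)  # appends the rows to Temp.OrderPandasPartit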
Example #20
    def test_upload_public_data_error(self):
        # Attempting to upload to a public, read-only, dataset should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        array = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999, '00.000.00.000', 1, 'hola',
                 99999999, False, False, 1, 'Jedi', 11210]]
        df = DataFrame(array)
        with self.assertRaises(bigquery_client.BigqueryServiceError):
            gbq.to_gbq(df, 'publicdata:samples.wikipedia', schema=None, col_order=None, if_exists='append')
Example #21
    def test_google_upload_errors_should_raise_exception(self):
        test_timestamp = datetime.datetime.now(pytz.timezone('US/Arizona'))
        bad_df = DataFrame({'bools': [False, False],
                            'flts': [0.0, 1.0],
                            'ints': [0, '1'],
                            'strs': ['a', 1],
                            'times': [test_timestamp, test_timestamp]
                            }, index=range(2))
        with tm.assertRaises(gbq.UnknownGBQException):
            gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id=PROJECT_ID)
Example #22
    def test_upload_replace(self):
        # Attempting to overwrite an existing table with valid data and a valid schema should succeed
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        schema = [
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
            'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
            'INTEGER', 'STRING', 'INTEGER'
        ]

        # Setup an existing table
        array1 = [[
            '', 1, '', 1, False, 1, '00.111.00.111', 1, 'hola', 1, True, True,
            1, 'Sith', 1
        ]]
        df1 = DataFrame(array1,
                        columns=[
                            'title', 'id', 'language', 'wp_namespace',
                            'is_redirect', 'revision_id', 'contributor_ip',
                            'contributor_id', 'contributor_username',
                            'timestamp', 'is_minor', 'is_bot', 'reversion_id',
                            'comment', 'num_characters'
                        ])
        gbq.to_gbq(df1,
                   'pandas_testing_dataset.test_data5',
                   schema=schema,
                   col_order=None,
                   if_exists='fail')

        array2 = [[
            'TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
            '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi',
            11210
        ]]

        # Overwrite the existing table with different data
        df2 = DataFrame(array2,
                        columns=[
                            'title', 'id', 'language', 'wp_namespace',
                            'is_redirect', 'revision_id', 'contributor_ip',
                            'contributor_id', 'contributor_username',
                            'timestamp', 'is_minor', 'is_bot', 'reversion_id',
                            'comment', 'num_characters'
                        ])
        gbq.to_gbq(df2,
                   'pandas_testing_dataset.test_data5',
                   schema=schema,
                   col_order=None,
                   if_exists='replace')

        # Read the table and confirm the new data is all that is there
        a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data5")
        self.assertTrue((a == df2).all().all())
Example #23
def upload_data_to_gbq(data, name_table):
    try:
        gbq.to_gbq(data,
                   name_table,
                   project_id='de-exam-kittisak',
                   if_exists='append')
        message = "status:200 Success send data to gbq"
        print(message)
        return message
    except Exception as e:
        raise e
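
A hypothetical call to the wrapper above; the DataFrame columns and table name are illustrative only:

scores = pd.DataFrame({'student_id': [1, 2], 'score': [0.8, 0.9]})
upload_data_to_gbq(scores, 'exam_dataset.scores')  # prints and returns the status message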
Example #24
    def test_upload_data(self):
        table_name = 'new_test1'

        test_size = 1000001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, chunksize=10000)

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name, project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #25
def dataToGBQ(analytics, token, sdate):
    z = [fin_obrabotchik(analytics, token, sdate)]

    labels = [
        'date', 'reach', 'reach_subscribers', 'comments', 'likes',
        'subscribed', 'unsubscribed', 'copies', 'sessions', 'bounceRate',
        'pageviews', 'duration', 'transactions', 'revenue', 'socialNet',
        'members'
    ]  # column order

    df = pd.DataFrame.from_records(z, columns=labels)
    gbq.to_gbq(df, 'Temp.Social', '78997000000', if_exists='append')
Example #26
    def test_upload_data(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 1000001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), project_id=PROJECT_ID)
        self.assertEqual(result["NUM_ROWS"][0], test_size)
Example #27
    def test_upload_data(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 1000001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                              project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #28
    def test_upload_replace_schema_error(self):
        # Attempting to replace an existing table without specifying a schema should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        df = DataFrame(self.correct_data_small)
        with self.assertRaises(gbq.SchemaMissing):
            gbq.to_gbq(df,
                       'pandas_testing_dataset.test_database',
                       schema=None,
                       col_order=None,
                       if_exists='replace')
Example #29
def sql_db_select():
    with open(file_db_connect) as f:
        param_connect = json.load(f)
    db_connect = MySQLdb.connect(user=param_connect['user'], passwd=param_connect['passwd'],
                                 host=param_connect['host'], charset='cp1251')

    sql_data = "select distinct z.ZAKAZ_ID,ifnull(purchase_group_id,-1) as group_id from `ukt_sess`.`ZAK` z " \
                 "left join `ukt_sess`.`ZAK_ITEMS` zi on (zi.zakaz_id=z.zakaz_id) " \
                 "left join `ukt_prod`.`ukt_purch_group_goods` gg on (gg.goods_id=zi.item_id) " \
                 "where DATE(z.DATA) between '2019-01-01' and '2019-04-30' and z.date_ans is not NULL and z.ANS is not NULL and z.STATUS>0 and z.archive<>1"

    df_mysql = pd.read_sql(sql_data, con=db_connect)
    gbq.to_gbq(df_mysql, 'Temp.QQ', '78997000000', if_exists='append')  # send the data to GBQ
Example #30
    def test_upload_data_if_table_exists_fail(self):
        destination_table = DESTINATION_TABLE + "2"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df))

        # Test the default value of if_exists is 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID)

        # Test the if_exists parameter with value 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail')
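
The same TableCreationError can be handled in application code to fall back to appending; a sketch, assuming the same pandas-gbq version these tests target:

try:
    gbq.to_gbq(df, destination_table, PROJECT_ID)  # if_exists defaults to 'fail'
except gbq.TableCreationError:
    # table already exists; append instead of failing
    gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')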
Example #31
def dataToGBQ():
    df = pd.read_csv(
        r'C:\Python\ukt\txtcsvFile\appsflyers\organic-in-app-events_2019-01-01_2019-01-31.csv'
    )
    z = len(df)
    df.columns = [i.replace(' ', '_') for i in list(df.columns.values)]
    # or equivalently: df.columns = df.columns.str.replace(' ', '_')
    df = df.drop_duplicates()
    gbq.to_gbq(df, f'{DATASET_ID}.{TABLE}', projectid,
               if_exists=import_action)  # send the data to GBQ

    print(
        'Loaded {} rows into {}. Source file: {} rows. Duplicates removed: {}'
        .format(len(df), TABLE, z, z - len(df)))
Example #32
    def test_upload_data_flexible_column_order(self):
        destination_table = DESTINATION_TABLE + "13"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_path())

        df_columns_reversed = df[df.columns[::-1]]

        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
                   if_exists='append', private_key=_get_private_key_path())
Example #33
    def test_invalid_number_of_columns_schema(self):
        # Specifying a schema that does not have same shape as dataframe should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        schema = ['INTEGER']
        df = DataFrame([[1, 'STRING']], columns=['fake', 'fake'])
        with self.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df,
                       'pandas_testing_dataset.test_data4',
                       schema=schema,
                       col_order=None,
                       if_exists='append')
Example #34
    def test_invalid_column_name_schema(self):
        # Specifying a schema that contains an invalid column name should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest(
                'Skipped because authentication information is not available.')

        schema = ['INCORRECT']
        df = DataFrame([[1]], columns=['fake'])
        with self.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df,
                       'pandas_testing_dataset.test_data',
                       schema=schema,
                       col_order=None,
                       if_exists='append')
Example #35
    def test_upload_data_if_table_exists_fail(self):
        destination_table = DESTINATION_TABLE + "2"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df))

        # Test the default value of if_exists is 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID)

        # Test the if_exists parameter with value 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail')
Example #36
    def test_upload_data(self):
        test_size = 1000001
        # create df to test for all BQ datatypes except RECORD
        bools = np.random.randint(2, size=(1,test_size)).astype(bool)
        flts = np.random.randn(1,test_size)
        ints = np.random.randint(1,10, size=(1,test_size))
        strs = np.random.randint(1,10, size=(1,test_size)).astype(str)
        times = [datetime.datetime.now(pytz.timezone('US/Arizona')) for t in xrange(test_size)]
        df = DataFrame({'bools':bools[0], 'flts':flts[0], 'ints':ints[0], 'strs':strs[0], 'times':times[0]}, index=range(test_size))
        gbq.to_gbq(df,"pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID, chunksize=10000)
        sleep(60) # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #37
    def test_upload_data_if_table_exists_fail(self):
        table_name = 'new_test2'

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.create_table('pydata_pandas_bq_testing.' + table_name, gbq.generate_bq_schema(df), PROJECT_ID)

        # Test the default value of if_exists is 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID)

        # Test the if_exists parameter with value 'fail'
        with tm.assertRaises(gbq.TableCreationError):
            gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, if_exists='fail')
Example #38
    def load_df_to_bq_tmp(self, df, tmp_table):
        '''
        This function inserts the provided dataframe into a temp table in BigQuery, which
        is used in other parts of this class (e.g. L1 and L2 expansions) to join on by
        patent number.
        '''
        print('Loading dataframe with cols {}, shape {}, to {}'.format(
            df.columns, df.shape, tmp_table))
        gbq.to_gbq(dataframe=df,
                   destination_table=tmp_table,
                   project_id=self.bq_project,
                   if_exists='replace',
                   verbose=False)

        print('Completed loading temp table.')
Example #39
def DfFromSQL():
    with open(file_db_connect) as f:
        param_connect = json.load(f)
    db_connect = MySQLdb.connect(user=param_connect['user'],
                                 passwd=param_connect['passwd'],
                                 host=param_connect['host'],
                                 db=param_connect['db_sess'],
                                 charset='cp1251')

    sql_query = "SELECT master_order_id,flag_cancel,pay_sum FROM ZAKAZ " \
                "where Date(created) between '{}' and '{}' and archive=0".format(DateStart, DateStop)
    df_mysql = pd.read_sql(sql_query, con=db_connect)
    db_connect.close()
    gbq.to_gbq(df_mysql, 'Mig_Data.Lud', '78997000000',
               if_exists='replace')  # send the data to GBQ
Example #40
    def test_upload_data(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=_get_project_id(),
                              private_key=_get_private_key_path())
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #41
    def test_upload_data_as_service_account_with_key_contents(self):
        destination_table = "{0}.{1}".format(DATASET_ID + "3", TABLE_ID + "1")

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_contents())

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=_get_project_id(),
            private_key=_get_private_key_contents())
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #42
    def test_upload_data_as_service_account_with_key_contents(self):
        destination_table = DESTINATION_TABLE + "12"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
                   private_key=PRIVATE_KEY_JSON_CONTENTS)

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=PROJECT_ID,
            private_key=PRIVATE_KEY_JSON_CONTENTS)
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #43
    def test_google_upload_errors_should_raise_exception(self):
        destination_table = DESTINATION_TABLE + "5"

        test_timestamp = datetime.now(pytz.timezone("US/Arizona"))
        bad_df = DataFrame(
            {
                "bools": [False, False],
                "flts": [0.0, 1.0],
                "ints": [0, "1"],
                "strs": ["a", 1],
                "times": [test_timestamp, test_timestamp],
            },
            index=range(2),
        )

        with tm.assertRaises(gbq.StreamingInsertError):
            gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)
Example #44
    def test_upload_bad_data_table(self):
        # Attempting to upload data that does not match schema should fail
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER',
                  'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER']

        array = [['TESTING_GBQ\',', False, 'hi', 0, True, 'STRING IN INTEGER', '00.000.00.000', 1, 'hola',
                 99999999, -100, 1000, 1, 'Jedi', 11210]]
        df = DataFrame(array, columns=['title','id','language','wp_namespace','is_redirect','revision_id',
                                       'contributor_ip','contributor_id','contributor_username','timestamp',
                                       'is_minor','is_bot','reversion_id','comment','num_characters'])
        with self.assertRaises(bigquery_client.BigqueryServiceError):
            gbq.to_gbq(df, 'pandas_testing_dataset.test_data1', schema=schema, col_order=None, if_exists='append')
Example #45
    def test_upload_data_if_table_exists_replace(self):
        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID, if_exists='replace')

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], 5)
Example #46
    def test_upload_data_if_table_exists_replace(self):
        table_name = 'new_test4'

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, if_exists='replace')

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name, project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], 5)
Example #47
    def test_upload_new_table(self):
        # Attempting to upload to a new table with valid data and a valid schema should succeed
        if not os.path.exists(self.bq_token):
            raise nose.SkipTest('Skipped because authentication information is not available.')

        schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER',
                  'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
                  'INTEGER', 'STRING', 'INTEGER']

        array = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999, '00.000.00.000', 1, 'hola',
                 99999999, False, False, 1, 'Jedi', 11210]]
        df = DataFrame(array, columns=['title','id','language','wp_namespace','is_redirect','revision_id',
                                       'contributor_ip','contributor_id','contributor_username','timestamp',
                                       'is_minor','is_bot','reversion_id','comment','num_characters'])
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data2', schema=schema, col_order=None, if_exists='append')
        a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data2")
        self.assertTrue((a == df).all().all())
Example #48
    def test_upload_data_as_service_account_with_key_contents(self):
        raise nose.SkipTest("flaky test")

        destination_table = DESTINATION_TABLE + "12"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_contents())

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq(
            "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
            project_id=_get_project_id(),
            private_key=_get_private_key_contents())
        self.assertEqual(result['NUM_ROWS'][0], test_size)
Example #49
    def test_upload_data_if_table_exists_append(self):
        destination_table = DESTINATION_TABLE + "3"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_path())

        # Test the if_exists parameter with value 'append'
        gbq.to_gbq(df, destination_table, _get_project_id(),
                   if_exists='append', private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=_get_project_id(),
                              private_key=_get_private_key_path())
        self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

        # Try inserting with a different schema, confirm failure
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df_different_schema, destination_table,
                       _get_project_id(), if_exists='append',
                       private_key=_get_private_key_path())
Example #50
    def test_upload_data_if_table_exists_append(self):
        table_name = 'new_test3'

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, chunksize=10000)

        # Test the if_exists parameter with value 'append'
        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, if_exists='append')

        sleep(60)  # <- Curses Google!!!

        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name, project_id=PROJECT_ID)
        self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

        # Try inserting with a different schema, confirm failure
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df_different_schema, "pydata_pandas_bq_testing." + table_name, PROJECT_ID, if_exists='append')
Example #51
    def test_to_gbq_should_fail_if_invalid_table_name_passed(self):
        with tm.assertRaises(gbq.NotFoundException):
            gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234")
Example #52
    def test_to_gbq_with_no_project_id_given_should_fail(self):
        with tm.assertRaises(TypeError):
            gbq.to_gbq(DataFrame(), 'dataset.tablename')
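
For contrast with the two failure cases above, a minimal valid call needs both a 'dataset.table' destination and a project_id; the names here are hypothetical:

gbq.to_gbq(DataFrame({'a': [1]}), 'my_dataset.my_table', project_id='my-project-id')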