Example No. 1
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=True, private_key=None, dialect='legacy',
             **kwargs):
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id,
        index_col=index_col, col_order=col_order,
        reauth=reauth, verbose=verbose,
        private_key=private_key,
        dialect=dialect,
        **kwargs)
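A minimal usage sketch of the wrapper above; the project ID is a placeholder and the table is a public BigQuery sample:

# Hypothetical call to the wrapper defined above.
df = read_gbq(
    "SELECT name, number "
    "FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 5",
    project_id="my-project",  # placeholder: your billing project
    dialect="standard",
)
print(df.head())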
Example No. 2
    def loadImages(self) -> pd.DataFrame:
        """Query All the images from the BigQuery Table

         Returns:
             Returns a DataFrame with all the images
         """

        df = pandas_gbq.read_gbq(
            'SELECT * FROM `sidhouses.osm_data.maps_images`',
            project_id='sidhouses')
        return df
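When several loaders query the same project, the project ID can also be set once on pandas_gbq.context (as some later examples in this collection do) instead of being repeated per call:

import pandas_gbq

# Set the billing project once; subsequent read_gbq calls can omit project_id.
pandas_gbq.context.project = "sidhouses"  # project taken from the example above

df = pandas_gbq.read_gbq("SELECT * FROM `sidhouses.osm_data.maps_images`")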
Example No. 3
def read_sql(query, billing_project_id=None, from_file=False, reauth=False):
    """Load data from BigQuery using a query. Just a wrapper around pandas.read_gbq

    Args:
        query (str):
            Valid standard SQL query against basedosdados.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here: https://console.cloud.google.com/projectselector2/home/dashboard
        from_file (boolean): Optional.
            Whether to load Google Cloud credentials from a local file.
        reauth (boolean): Optional.
            Re-authorize the Google Cloud Project in case you need to change users or reset configurations.

    Returns:
        pd.DataFrame:
            Query result
    """

    try:
        return pandas_gbq.read_gbq(
            query,
            credentials=credentials(from_file=from_file, reauth=reauth),
            project_id=billing_project_id,
        )
    except (OSError, ValueError):
        raise BaseDosDadosException(
            "\nWe are not sure which Google Cloud project should be billed.\n"
            "First, you should make sure that you have a Google Cloud project.\n"
            "If you don't have one, set one up following these steps: \n"
            "\t1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
            "\t2. Agree with Terms of Service if asked\n"
            "\t3. Click in Create Project\n"
            "\t4. Put a cool name in your project\n"
            "\t5. Hit create\n"
            ""
            "Copy the Project ID, (notice that it is not the Project Name)\n"
            "Now, you have two options:\n"
            "1. Add an argument to your function poiting to the billing project id.\n"
            "   Like `bd.read_table('br_ibge_pib', 'municipios', billing_project_id=<YOUR_PROJECT_ID>)`\n"
            "2. You can set a project_id in the environment by running the following command in your terminal: `gcloud config set project <YOUR_PROJECT_ID>`."
            "   Bear in mind that you need `gcloud` installed.")
    except GenericGBQException as e:
        if "Reason: 403" in str(e):
            raise BaseDosDadosException(
                "\nYou still don't have a Google Cloud Project.\n"
                "Set one up following these steps: \n"
                "1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
                "2. Agree with Terms of Service if asked\n"
                "3. Click in Create Project\n"
                "4. Put a cool name in your project\n"
                "5. Hit create\n"
                "6. Rerun this command with the flag `reauth=True`. \n"
                "   Like `read_table('br_ibge_pib', 'municipios', reauth=True)`"
            )
        else:
            raise e
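A hypothetical call to read_sql; the table and billing project below are placeholders:

df = read_sql(
    "SELECT * FROM `basedosdados.br_ibge_pib.municipio` LIMIT 100",  # placeholder table
    billing_project_id="<YOUR_PROJECT_ID>",
)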
Example No. 4
def search_in_user_vk_library(username, word='', mode='post',
                              project_id=project_id, credentials=credentials):
    try:
        if mode == 'post_from_group':
            group_name, word = word.split(',')

            word = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", word)
            words = word.lower().split()
            words = [w for w in words if w not in stops]
            words = [stemmer.stem(w) for w in words]
            if not words:
                return "некорректный ввод"  # "invalid input"

            Query = f'SELECT * FROM dataset.vk_storage_{username} WHERE group_name=\'{group_name}\' and (post LIKE \''
            for word in words:
                Query += '%{}'.format(word)
            Query += '%\' or post LIKE \' '
            flag = True
            for word in words:
                if flag:
                    Query += '%{}'.format(word.capitalize())
                    flag = False
                else:
                    Query += '%{}'.format(word)
            Query += '%\')'

        word = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", word)
        words = word.lower().split()
        words = [w for w in words if not w in stops]
        words = [stemmer.stem(w) for w in words]
        if words=='':
            return "некорректный ввод"

        if mode == 'post':
            Query = f'SELECT * FROM dataset.vk_storage_{username} WHERE post LIKE \''
            for word in words:
                Query += '%{}'.format(word)
            Query += '%\' or post LIKE \' '
            flag = True
            for word in words:
                if flag:
                    Query += '%{}'.format(word.capitalize())
                    flag = False
                else:
                    Query += '%{}'.format(word)
            Query += '%\''

        print(Query)
        df = gbq.read_gbq(Query, project_id, credentials=credentials)

        result = df.values.tolist()

        return result
    except Exception:
        return []
Example No. 5
    def rawQuery(self, sql):
        """this will send the sql to BQ and return the results

        Args:
            sql(str): the sql string you care about

        Returns:
            DataFrame: a pandas.DataFrame of the results
        """
        df = pandas_gbq.read_gbq(sql)
        return df
Example No. 6
def login():
    if request.method == 'GET':
        return render_template('login.html')
    else:
        email = request.form.get('email')
        password = request.form.get('password')
        print(email, password)  # note: never log raw passwords outside a demo
        SQL = """
            SELECT password
            FROM `movie.users`
            WHERE email='%s'
            """ % (email)
        try:
            df = pandas_gbq.read_gbq(SQL)
            print('success')
            print(df)
            if len(df) > 0:
                if df.iloc[0].password == password:
                    cur_user = UserMixin()
                    cur_user.id = email
                    login_user(cur_user)
                    session['user_email'] = email
                    SQL = """
                            SELECT uid
                            FROM `movie.users`
                            WHERE email='%s'
                            """ % (email)
                    df = pandas_gbq.read_gbq(SQL)
                    session['user_id'] = int(df.iloc[0].uid)
                    session.permanent = True
                    return redirect(url_for('index'))
                else:
                    error = 'email or password is wrong, try again'
                    return render_template('login.html', error=error)
            else:
                error = 'user does not exist'
                return render_template('login.html', error=error)
        except Exception:
            error = 'something went wrong, try again'
            return render_template('login.html', error=error)
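Interpolating request values into SQL, as the view above does, invites injection. BigQuery supports named query parameters, and pandas-gbq accepts a REST-style configuration dict that can carry them; a sketch under that assumption:

# Sketch: the same lookup with a named parameter instead of interpolation.
sql = "SELECT password FROM `movie.users` WHERE email = @email"
config = {
    "query": {
        "parameterMode": "NAMED",
        "queryParameters": [
            {
                "name": "email",
                "parameterType": {"type": "STRING"},
                "parameterValue": {"value": email},
            }
        ],
    }
}
df = pandas_gbq.read_gbq(sql, configuration=config, dialect="standard")

Comparing plaintext passwords, as the example does, is likewise only acceptable in a demo.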
Example No. 7
    def query(self,
              query: str,
              chunksize: int,
              exact=False) -> Generator[DataFrame, None, None]:
        """
        Query DAO repo and returns a generator of DataFrames with query results.

        Keyword Arguments:
            query {str} -- Query string
            chunksize {int} -- Number of rows of dataframe per chunk
            exact {bool} -- If false, query orders results and returns chunks

        Returns:
            Generator[DataFrame] -- Generator to iterate over DataFrame results.
        """
        if query is None:
            raise DaoError
        try:
            if exact:
                self.LOGGER.info(f"Querying {self.tablename}: {query}")
                response_df = pandas_gbq.read_gbq(query,
                                                  progress_bar_type=None)
                yield response_df
            else:
                offset = 0
                # Remove semicolon if exists in original query to add ordering to query
                query = query.strip(";")
                while True:
                    add_query = (
                        f" ORDER BY {self._pkey} LIMIT {chunksize} OFFSET {offset};"
                    )
                    gbq_query = query + add_query
                    self.LOGGER.info(f"Querying {self.tablename}: {gbq_query}")
                    response_df = pandas_gbq.read_gbq(gbq_query)
                    if response_df.empty:
                        return
                    offset += chunksize
                    yield response_df
        except Exception as gbq_exp:
            self.LOGGER.exception(str(gbq_exp))
            raise DaoError(gbq_exp) from gbq_exp
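A usage sketch for the generator above; dao stands in for an instance of the surrounding class and the table name is a placeholder:

import pandas as pd

# dao is a hypothetical instance of the class that defines query() above.
chunks = dao.query("SELECT * FROM `my-project.my_dataset.my_table`",
                   chunksize=10_000)
df = pd.concat(chunks, ignore_index=True)

Note that each chunk is issued as a separate ORDER BY/LIMIT/OFFSET query, so BigQuery re-runs the query per chunk; fine for moderate result sets, costly for large ones.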
Example No. 8
    def loadVillages(self) -> pd.DataFrame:
        """Query All the villages from the BigQuery Table
            Generating the Polygons and Points

         Returns:
             Returns a DataFrame with all the villages
         """

        df = pandas_gbq.read_gbq('SELECT * FROM sidhouses.osm_data.villages')
        self.create_polygons(df)
        self.create_points(df)
        return df
Example No. 9
def author_relationship(name):
    pandas_gbq.context.credentials = credentials
    pandas_gbq.context.project = "bigdata-259800"

    nodes_info = {}
    nodes, edges = [], []
    p_name = name.split(' ')
    name = ''.join(['%' + i for i in p_name]) + '%'
    # print(name)

    SQL = f"SELECT * FROM `bigdata-259800.authorinfo.author_edges` where lower(source) like '%{name.lower()}%' limit 200"
    df = pandas_gbq.read_gbq(SQL)

    # print(df)
    au_info = df.source[0].split('-')
    source_node = au_info[0]
    contact_info = ''.join(au_info[1:])  # '' when no contact info is present

    nodes_info[source_node] = {'id': 0, 'num': 0, 'info': contact_info}

    error_symbol = ['none', '@', 'and', 'or']

    for i in range(len(df.target)):
        current_node = df.target[i]
        split_info = current_node.split('-')
        node_name = split_info[0]

        if any(sym in node_name.lower() for sym in error_symbol):
            continue
        if len(split_info) > 1:
            c_info = ' '.join(split_info[1:])
        else:
            c_info = ''

        if node_name in nodes_info:
            nodes_info[node_name]['num'] += 1
        else:
            nodes_info[node_name] = {'id': i + 1, 'num': 1, 'info': c_info}

    for i in nodes_info.keys():
        if i == source_node:
            title = 'Contact info: ' + nodes_info[i]['info'] + '\n;' + '# Friends: ' + str(len(nodes_info.keys()) - 1)
        else:
            title = 'Contact info: ' + nodes_info[i]['info']

        nodes.append({'id':nodes_info[i]['id'], 'label': i, 'title': title})

        if i != source_node:
            edges.append({'from': 0, 'to': nodes_info[i]['id'], 'width': nodes_info[i]['num'],
                          'label': str(nodes_info[i]['num']), 'font': {'size': 0}})
    return nodes, edges
Example No. 10
def has_tweet_id(tweet_id):
    q = """
    SELECT og_tweet_id
    FROM twitter.tweets
    WHERE og_tweet_id = {}
    """.format(tweet_id)
    df = pandas_gbq.read_gbq(q, project_id=project_id, credentials=credentials)
    return not df.empty
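A variant sketch of the same check: since only existence matters, LIMIT 1 avoids materializing every match, and casting to int guards the interpolation (project_id and credentials are the same module-level names the example assumes):

def has_tweet_id_fast(tweet_id):
    q = """
    SELECT 1
    FROM twitter.tweets
    WHERE og_tweet_id = {}
    LIMIT 1
    """.format(int(tweet_id))  # int() rejects non-numeric input
    df = pandas_gbq.read_gbq(q, project_id=project_id, credentials=credentials)
    return not df.empty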
Example No. 11
def getFreshData(ProjectId):
    bigquery_sql = " ".join([
        "SELECT id, DATE(CAST(created_at AS DATETIME)) AS created, DATE(CAST(updated_at AS DATETIME)) AS updated, status, assignee_id",
        "FROM `xsolla_summer_school.customer_support`",
        "WHERE status IN ('closed','solved')", "ORDER BY updated_at"
    ])

    dataframe = pandas_gbq.read_gbq(bigquery_sql,
                                    project_id=ProjectId,
                                    dialect="standard")

    return dataframe
Example No. 12
def read_gbq(query, 
            project_id='robusta-lab', 
            **kwargs):
    """
    write a dataframe in Google BigQuery
    """

    return pandas_gbq.read_gbq(
        query,
        project_id,
        credentials=_get_credentials_gbq(),
        **kwargs)
Example No. 13
def bq_python(ds, **ags):
    # Pull the credentials produced by the previous task.
    credentials = ags['task_instance'].xcom_pull(task_ids='bq_connection')
    # Conditional: if the table does not exist, select all; otherwise select
    # only the newest rows (check against BigQuery).
    query = """
        SELECT DISTINCT order_id, user_id
        FROM `acube_2019.acube_fintech_final_project_2019`
        LIMIT 10
    """

    constellation = pandas_gbq.read_gbq(query=query,
                                        project_id='minerva-da-coe',
                                        credentials=credentials)
    print(constellation.head())
    return constellation
Example No. 14
def connection(request):
    pandas_gbq.context.credentials = credentials
    pandas_gbq.context.project = "Your-Project"
    SQL1 = ''
    df1 = pandas_gbq.read_gbq(SQL1)

    SQL2 = ''
    df2 = pandas_gbq.read_gbq(SQL2)

    data = {}
    '''
        TODO: Finish the SQL to query the data; it should be limited to 8 rows.
        Then process them into the format below:
        Format of data:
        {'n': [xxx, xxx, xxx, xxx],
         'e': [{'source': xxx, 'target': xxx},
                {'source': xxx, 'target': xxx},
                ...
                ]
        }
    '''
    return render(request, 'connection.html', data)
Example No. 15
    def create_bq(self):
        for dset in self.dsets:
            df = pandas_gbq.read_gbq("""SELECT *
                                    FROM jhu_covid_dset.{}
                                    """.format(dset))
            df.drop(['province_state', 'lat', 'long'], axis=1, inplace=True)
            df = df.groupby(['country_region']).sum().T
            df = self.clean_data(df)
            # df.drop(df.tail(1).index, inplace=True)
            pandas_gbq.to_gbq(df,
                              'torran_covid_dset.{}'.format(dset),
                              if_exists='replace')
        return
Example No. 16
    def loadOSMBuildings(self) -> pd.DataFrame:
        """Query All the buildings from the BigQuery Table
            Generating the Polygons and Points

         Returns:
             Returns a DataFrame with all the buildings
         """

        df = pandas_gbq.read_gbq('SELECT * FROM sidhouses.osm_data.buildings',
                                 project_id='sidhouses')
        # self.create_polygons(df)
        self.create_points(df)
        return df
Example No. 17
    def gbq_query(self, url, str_args):
        """Queries a GBQ table and returns the output.

        Args:
            url: A string that contains a URL.
            str_args: A tuple of arguments to pass into an SQL query.

        Returns:
            A pandas data frame.
        """

        start = datetime.now()
        print('Started processing query: {}'.format(start))
        query = requests.get(url, allow_redirects=True).text.format(*str_args)

        try:
            results = pandas_gbq.read_gbq(query,
                                          dialect='standard',
                                          project_id=self.project,
                                          credentials=self.auth2)

        except pandas_gbq.exceptions.AccessDenied:
            self.get_authorization()

            results = pandas_gbq.read_gbq(query,
                                          dialect='standard',
                                          project_id=self.project,
                                          credentials=self.auth2)

        finish = datetime.now()
        print("Finished processing query: {}".format(finish))

        duration = finish - start
        time_diff = round(duration.total_seconds(), 2)
        print('Query returned: {0} results in {1} seconds \n'.format(
            len(results), time_diff))

        return results
Example No. 18
def update_tables(project_id: str,
                  new_source_tableid: str = 'raw',
                  destination_tableid: str = 'cities_refined',
                  operation_func=operation_refine_city_data_appendbq):
    raw = pandas_gbq.read_gbq(f"""
            SELECT *
            FROM `{project_id}.{config['dataset_id']}.{new_source_tableid}`
            WHERE CAST(datetime as DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
            """)
    hist = pandas_gbq.read_gbq(f"""
            SELECT *
            FROM `{project_id}.{config['dataset_id']}.{destination_tableid}`
            WHERE CAST(datetime as DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
            """)
    newly_arrived = filter_newly_arrived(raw, hist)
    if newly_arrived is not None:
        operation_func(project_id=project_id,
                       destination_tableid=destination_tableid,
                       newly_arrived=newly_arrived)
    else:
        print(
            f"0 rows added to table: {config['dataset_id']}.{destination_tableid}"
        )
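filter_newly_arrived is not shown in this example; a plausible implementation is an anti-join of raw against hist on a key column. The key name below is an assumption:

from typing import Optional

import pandas as pd

def filter_newly_arrived(raw: pd.DataFrame, hist: pd.DataFrame,
                         key: str = "datetime") -> Optional[pd.DataFrame]:
    """Hypothetical sketch: keep rows of raw whose key is absent from hist."""
    new_rows = raw[~raw[key].isin(hist[key])]
    return new_rows if not new_rows.empty else None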
Example No. 19
def update_geodemo_chart(activity_selected, state_name, county_name):
    sql = """
        SELECT county_fips_code, prediction_date, new_confirmed, new_confirmed_ground_truth
        FROM `bigquery-public-data.covid19_public_forecasts.county_14d` 
        WHERE state_name="{state_name}"  AND county_name="{county_name}"
        """.format(state_name=state_name, county_name=county_name)
    forecast_df = pandas_gbq.read_gbq(sql, project_id=project_id)
    county_fips_code = forecast_df['county_fips_code'][0]
    county_pop = int(df_population_2019[df_population_2019['fips'] ==
                                        county_fips_code]['population'])
    try:
        return make_original_property_graph(activity_selected, county_pop,
                                            forecast_df)
    except Exception:
        return None
Example No. 20
def load_t0_from_bq(area, project_id):
    start_time = time.time()

    summary_sql = """
    SELECT distinct section
    FROM `ETL.root_sku`
    WHERE area = "%s"   """ %(area)

    for i in tqdm(range(1), desc='Loading table...'):
        section_table = pandas_gbq.read_gbq(summary_sql, project_id=project_id)

    total_time = round((time.time() - start_time) / 60, 1)
    logger.info("Completed loading of distinct sections table from Bigquery {a} mins...".format(a=total_time))

    return section_table
Example No. 21
def load_t1_from_bq(project_id):
    start_time = time.time()

    summary_sql = """
    SELECT *
    FROM `prediction_results.post_prediction_train_input`
    """

    for i in tqdm(range(1), desc='Loading table...'):
        hist_promo_table = pandas_gbq.read_gbq(summary_sql, project_id=project_id)

    total_time = round((time.time() - start_time) / 60, 1)
    logger.info("Completed loading of historical post promotion table from Bigquery {a} mins...".format(a=total_time))

    return hist_promo_table
Example No. 22
def query_data(end_date, user_type, output_path):

    start_date = end_date - timedelta(days=27)
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    query = f'''
            WITH OLD_USER AS(
                (SELECT user_id, TYPE
                FROM `ntufbdata.user_type.user_entering_type`
                WHERE TYPE = '{user_type}')
                UNION DISTINCT
                (SELECT user_id, TYPE
                FROM `ntufbdata.user_type.user_entering_type`
                WHERE TYPE = 'WHOLE')

            ), REACTION AS(
                (SELECT user_id,
                        SPLIT(post_id, '_')[ORDINAL(1)] AS page_id,
                        post_id
                FROM `ntufbdata.USdata.1000_page_us_user_like_post_201501_to_201611_all`
                WHERE TIMESTAMP(post_created_date_CT) >= TIMESTAMP('{start_date}')
                AND TIMESTAMP(post_created_date_CT) <= TIMESTAMP('{end_date}'))
                UNION DISTINCT
                (SELECT user_id,
                        SPLIT(post_id, '_')[ORDINAL(1)] AS page_id,
                        post_id
                FROM `ntufbdata.USdata.politician_us_user_post_like_all`
                WHERE TIMESTAMP(post_created_date_CT) >= TIMESTAMP('{start_date}')
                AND TIMESTAMP(post_created_date_CT) <= TIMESTAMP('{end_date}'))
            )

            SELECT user_id,
                   TYPE,
                   STRING_AGG(page_id, ',') AS like_pages,
                   STRING_AGG(CAST(like_time AS STRING), ',') AS like_times
            FROM (
            SELECT OLD_USER.user_id,
                   OLD_USER.TYPE,
                   REACTION.page_id,
                   COUNT(*) AS like_time
            FROM OLD_USER
            INNER JOIN REACTION ON OLD_USER.user_id = REACTION.user_id
            GROUP BY OLD_USER.user_id, OLD_USER.TYPE, REACTION.page_id)
            GROUP BY user_id, TYPE
            '''

    user_like_pages = gbq.read_gbq(query, project_id='ntufbdata')
    user_like_pages.to_csv(f'{output_path}{end_date}.csv', index=False)
Example No. 23
    def query_to_df(self, sql_query):
        """Query sql_query and return results in pandas dataframeself.

        Parameters
        ----------
        sql_query: string

        Returns
        -------
        df: pandas.DataFrame

        """
        df = pandas_gbq.read_gbq(sql_query,
                                 project_id=self.project_id,
                                 credentials=self.credentials)
        return df
Example No. 24
def update_heatmap(state_name, county_name):
    sql = """
        SELECT county_fips_code, prediction_date, new_confirmed, new_confirmed_ground_truth
        FROM `bigquery-public-data.covid19_public_forecasts.county_14d` 
        WHERE state_name="{state_name}"  AND county_name="{county_name}"
        """.format(state_name=state_name, county_name=county_name)
    forecast_df = pandas_gbq.read_gbq(sql, project_id=project_id)
    county_fips_code = forecast_df['county_fips_code'][0]
    county_pop = int(df_population_2019[df_population_2019['fips'] ==
                                        county_fips_code]['population'])

    # Return to the original heatmap (no colored annotation) by resetting
    try:
        return generate_forecast_heatmap(county_pop, forecast_df)
    except Exception:
        return None
Example No. 25
    def load_data(self):
        # sql = '''
        #         SELECT l.increment_id order_code
        #                                 , max(ward_sellerboom_score) ward_sellerboom_score
        #                                 , max(ward_reseller_score) ward_reseller_score
        #                                 , max(ward_other_score) ward_other_score
        #                                 , s.* except(order_code, ward_sellerboom_score, ward_reseller_score, ward_other_score)
        #         FROM `tiki-dwh.sherlock.fraud_label_2020405` l
        #         LEFT JOIN `tiki-dwh.sherlock.feature_summary_*` s
        #           ON cast(l.increment_id as string) = s.order_code
        #         GROUP BY 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
        #                 , 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59
        #         '''
        sql = 'select * from `tiki-dwh.consumer_product.fraud_raw_data`'
        raw = pandas_gbq.read_gbq(sql, project_id='tiki-dwh', credentials=credentials)
        return raw
Example No. 26
    def query(self, sql_query, show_progress=False):
        """
        Run BigTable queries on this dataset. Tables can be referenced in FROM and JOIN SQL clauses
        using their .full_name attributes injected into a query template.
        """
        if show_progress:
            progress_bar_type = 'tqdm'
        else:
            progress_bar_type = None

        df = pandas_gbq.read_gbq(
            sql_query,
            project_id=self.project_id,
            progress_bar_type=progress_bar_type,
        )
        return df
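A usage sketch; ds stands in for an instance of the surrounding class, and show_progress=True assumes the tqdm package is installed (pandas-gbq's 'tqdm' progress bar type requires it):

# ds is a hypothetical instance of the dataset class above.
df = ds.query(
    "SELECT COUNT(*) AS n FROM `bigquery-public-data.samples.shakespeare`",
    show_progress=True,  # renders a tqdm progress bar while rows download
)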
Example No. 27
    def get_dataframe_from_table(self, project, _dataset, _table):
        # TODO: Add the columns as a parameter for the lookup
        #columns = "PARTNER_ID, PRODUCT_ID, MANUFACTURER_ID, CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4, NAME, INTRODUCED_DATE, RETIRED_DATE, UNIT, BRAND, PACKAGE_SIZE, PACKAGE_UNIT, PRIVATE_LABEL_FLAG, GTIN"
        columns = "*"
        dtset = _dataset
        tble = _table

        print("table ==> " + _table)

        sQuery = "SELECT " + columns + " FROM " + dtset + ".`" + tble + "`"
        df_return = pandas_gbq.read_gbq(sQuery,
                                        project,
                                        private_key=self.key,
                                        dialect='standard')

        return df_return
Example No. 28
    def get_df(self, data_source: GoogleBigQueryDataSource) -> pd.DataFrame:
        """
        Uses the pandas read_gbq method to extract data from BigQuery into a DataFrame.
        See: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.gbq.read_gbq.html
        Note:
            The parameter reauth is set to True to force Google BigQuery to reauthenticate
            the user for each query. This is necessary when running multiple extractions
            to avoid the error: [Errno 54] Connection reset by peer
        """
        credentials = (
            self.credentials.get_google_oauth2_credentials().with_scopes(
                self.scopes))
        return pandas_gbq.read_gbq(query=data_source.query,
                                   project_id=self.credentials.project_id,
                                   credentials=credentials,
                                   dialect=self.dialect)
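The GoogleBigQueryDataSource type is not shown here; the method only reads its query attribute, so a minimal stand-in could look like this (the dataclass is an assumption, not the real type):

from dataclasses import dataclass

@dataclass
class GoogleBigQueryDataSource:
    """Hypothetical stand-in: get_df() above only reads .query."""
    query: str

source = GoogleBigQueryDataSource(query="SELECT 1 AS probe")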
Example No. 29
def main(project_id):
    # [START bigquery_pandas_gbq_read_gbq_simple]
    import pandas_gbq

    # TODO: Set project_id to your Google Cloud Platform project ID.
    # project_id = "my-project"

    sql = """
    SELECT country_name, alpha_2_code
    FROM `bigquery-public-data.utility_us.country_code_iso`
    WHERE alpha_2_code LIKE 'A%'
    """
    df = pandas_gbq.read_gbq(sql, project_id=project_id)
    # [END bigquery_pandas_gbq_read_gbq_simple]
    print(df)
    return df
Example No. 30
def load_daily_trans_from_bq(cat, project_id):
    start_time = time.time()

    sql_str = """
    SELECT
        sku_root_id,
        store_id, std_price_per_unit,
        AVG(avg_sales_qty_per_week) as avg_sales_qty,
        AVG(actual_price) as actual_price,
        COUNT(week_start_date) as duration_weeks,
        STDDEV(avg_sales_qty_per_week) as std_dev_sales_qty FROM (with temp as (
            SELECT
                sku_root_id,
                store_id,
                DATE_TRUNC(date, WEEK(MONDAY)) as week_start_date,
                std_price_per_unit, SUM(total_sale_qty) as avg_sales_qty_per_week,
                SAFE_DIVIDE(SUM(total_sale_amt),
                SUM(total_sale_qty)) as actual_price
            FROM `gum-eroski-dev.ETL.aggregate_daily_transaction_to_sku`
                WHERE area in ("ALIMENTACION", "FRESCOS")
                AND category = {c}
                AND promo_flag = false
                #AND store_id in ('149','155','157','159','164','165','182','184','185','190','192','201','207','208','209','212','213','5','6','16','22','25','26','28','29','30','31','36','41','46','47','51','52','68','74','86','87','88','96','98','99','101','103','106','108','119','120','125','138','143','144','263','264','266','280','281','282','283','290','300','302','308','316','320','323','326','330','217','219','223','224','230','231','233','234','235','236','238','240','243','245','248','249','259','393','397','400','401','403','410','418','419','420','422','424','427','429','430','475','331','334','335','346','352','357','358','359','371','378','379','380','381','383','385','387','390','476','479','480','484','488','489','495','496','498','501','502','505','510','512','544','546','547','548','549','550','551','552','553','555','556','558','562','587','599','602','607','727','730','733','734','735','736','748','890','996','1000','1362','1373','1374','1392','1393','1499','1501','1504','2003','2005','3001','608','663','664','665','666','668','669','671','673','674','677','678','679','690','691','718','719','720','721','723','726','3057','3058','3059','3090','3091','3092','3095','3097','3098','3100','3102','3104','3105','3106','3107','3108','3109','3110','3111','3112','3113','3114','3116','3004','3008','3011','3013','3017','3020','3023','3024','3025','3026','3027','3028','3029','3034','3036','3042','3045','3049','3052','3053','3054','3055','3056','3177','3183','3188','3189','3194','3196','3203','3204','3206','3207','3208','3209','3213','3214','3219','3224','3226','3228','3229','3230','3232','3235','3238','3117','3118','3119','3120','3121','3122','3123','3125','3126','3127','3128','3129','3130','3131','3138','3139','3154','3155','3156','3162','3163','3165','3175','3239','3240','3242','3244','3245','3247','3249','3250','3252','3254','3256','3257','3259','3261','3262','3263','3264','3266','3268','3294','3295','3297','3298','3299','3368','3369','3370','3371','3372','3373','3382','3384','3387','3643','3978','3979','3981','3982','3984','3986','3987','3988','3989','3991','3992','3994','3995','3644','3646','3647','3648','3885','3886','3888','3902','3906','3907','3908','3911','3912','3913','3914','3917','3919','3922','3971','3972','3975','3976','3977','4264','4273','4277','4296','4297','4299','4357','4360','4361','4041','4047','4090','4091','4102','4103','4106','4111','4128','4134','4203','4247','4261','4484','4600','4369','4371','4373','4374','4382','4384','4388','4390','4469','4705','4749','4750','4751','4752','4753','4754','4755','4756','4757','4758','4759','4761','4763','4764','4767','4768','4785','4786','4935','4937','5382','6413','6414','6438','6483','7514','7564','7565','7566','7567','7569','7573','8122','8133','8143','8144','8149','8206','8212','8216','8219','8221','9050','9059','9064','6767','6768','9891','271','288','262','3985','3990','4395','6136','6282','6283','6284','7575','8121','8127','8135','8211','9026','9030','9061','9706','9803','9877','9879','9887','9889','9959','210','433','5007','5091','5106','5111','5301','5318','5725','5744','7444','5016','5086','5371','175','187','202','250','399','445','5009','5021','5040','5052','5083','5908','7423')
                AND total_sale_qty <> 0
                GROUP BY sku_root_id,
                    week_start_date,
                    store_id,
                    std_price_per_unit
                )
    SELECT * from temp
    WHERE EXTRACT(MONTH from week_start_date) <> 12
    AND SAFE_DIVIDE(ABS(actual_price-std_price_per_unit),std_price_per_unit)<0.1)
    GROUP BY sku_root_id,
        store_id,
        std_price_per_unit
    """.format(c=cat)
    start = time.time()

    for i in tqdm(range(1), desc='Loading table...'):
        category_table = pandas_gbq.read_gbq(sql_str, project_id=project_id)

    total_time = round((time.time() - start_time) / 60, 1)
    logger.info(
        "Completed loading of category table from Bigquery {a} mins...".format(
            a=total_time))

    return category_table
Example No. 31
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=True, private_key=None, dialect='legacy',
             **kwargs):
    r"""Load data from Google BigQuery.

    The main method a user calls to execute a Query in Google BigQuery
    and read results into a pandas DataFrame.

    Google BigQuery API Client Library v2 for Python is used.
    Documentation is available `here
    <https://developers.google.com/api-client-library/python/apis/bigquery/v2>`__

    Authentication to the Google BigQuery service is via OAuth 2.0.

    - If "private_key" is not provided:

      By default "application default credentials" are used.

      If default application credentials are not found or are restrictive,
      user account credentials are used. In this case, you will be asked to
      grant permissions for product name 'pandas GBQ'.

    - If "private_key" is provided:

      Service account credentials will be used to authenticate.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values
    project_id : str
        Google BigQuery Account project ID.
    index_col : str (optional)
        Name of result column to use for index in results DataFrame
    col_order : list(str) (optional)
        List of BigQuery column names in the desired order for results
        DataFrame
    reauth : boolean (default False)
        Force Google BigQuery to reauthenticate the user. This is useful
        if multiple accounts are used.
    verbose : boolean (default True)
        Verbose output
    private_key : str (optional)
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. jupyter iPython notebook on remote host)

    dialect : {'legacy', 'standard'}, default 'legacy'
        'legacy' : Use BigQuery's legacy SQL dialect.
        'standard' : Use BigQuery's standard SQL (beta), which is
        compliant with the SQL 2011 standard. For more information
        see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/sql-reference/>`__

    **kwargs : Arbitrary keyword arguments
        configuration (dict): query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query

    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id,
        index_col=index_col, col_order=col_order,
        reauth=reauth, verbose=verbose,
        private_key=private_key,
        dialect=dialect,
        **kwargs)
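The configuration dict described in the docstring maps directly onto the BigQuery job REST resource; for example, disabling the query cache for a single call:

# Per the docstring above: turn off BigQuery's query cache for this query.
df = read_gbq(
    "SELECT COUNT(*) AS n FROM `bigquery-public-data.samples.shakespeare`",
    project_id="my-project",  # placeholder: your billing project
    dialect="standard",
    configuration={"query": {"useQueryCache": False}},
)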
Example No. 32
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=None, private_key=None, dialect='legacy',
             **kwargs):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    Authentication to the Google BigQuery service is via OAuth 2.0.

    - If "private_key" is not provided:

      By default "application default credentials" are used.

      If default application credentials are not found or are restrictive,
      user account credentials are used. In this case, you will be asked to
      grant permissions for product name 'pandas GBQ'.

    - If "private_key" is provided:

      Service account credentials will be used to authenticate.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str
        Google BigQuery Account project ID.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    private_key : str, optional
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    dialect : str, default 'legacy'
        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
    verbose : boolean, deprecated
        *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
        to adjust verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
    kwargs : dict
        Arbitrary keyword arguments.
        configuration (dict): query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id,
        index_col=index_col, col_order=col_order,
        reauth=reauth, verbose=verbose,
        private_key=private_key,
        dialect=dialect,
        **kwargs)
Example No. 33
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, private_key=None, auth_local_webserver=False,
             dialect='legacy', location=None, configuration=None,
             verbose=None):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    See the `How to authenticate with Google BigQuery
    <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
    guide for authentication instructions.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str, optional
        Google BigQuery Account project ID. Optional when available from
        the environment.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    private_key : str, optional
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    auth_local_webserver : boolean, default False
        Use the `local webserver flow`_ instead of the `console flow`_
        when getting user credentials.

        .. _local webserver flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
        .. _console flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

        *New in version 0.2.0 of pandas-gbq*.
    dialect : str, default 'legacy'
        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
    location : str, optional
        Location where the query job should run. See the `BigQuery locations
        documentation
        <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
        list of available locations. The location must match that of any
        datasets used in the query.

        *New in version 0.5.0 of pandas-gbq*.
    configuration : dict, optional
        Query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery REST API Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
    verbose : None, deprecated
        Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
        to adjust verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth, verbose=verbose,
        private_key=private_key, auth_local_webserver=auth_local_webserver,
        dialect=dialect, location=location, configuration=configuration)
Example No. 34
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, auth_local_webserver=False, dialect=None,
             location=None, configuration=None, credentials=None,
             use_bqstorage_api=None, private_key=None, verbose=None):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    See the `How to authenticate with Google BigQuery
    <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
    guide for authentication instructions.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str, optional
        Google BigQuery Account project ID. Optional when available from
        the environment.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    auth_local_webserver : boolean, default False
        Use the `local webserver flow`_ instead of the `console flow`_
        when getting user credentials.

        .. _local webserver flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
        .. _console flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

        *New in version 0.2.0 of pandas-gbq*.
    dialect : str, default 'legacy'
        Note: The default value is changing to 'standard' in a future version.

        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.

        .. versionchanged:: 0.24.0
    location : str, optional
        Location where the query job should run. See the `BigQuery locations
        documentation
        <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
        list of available locations. The location must match that of any
        datasets used in the query.

        *New in version 0.5.0 of pandas-gbq*.
    configuration : dict, optional
        Query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery REST API Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
    credentials : google.auth.credentials.Credentials, optional
        Credentials for accessing Google APIs. Use this parameter to override
        default credentials, such as to use Compute Engine
        :class:`google.auth.compute_engine.Credentials` or Service Account
        :class:`google.oauth2.service_account.Credentials` directly.

        *New in version 0.8.0 of pandas-gbq*.

        .. versionadded:: 0.24.0
    use_bqstorage_api : bool, default False
        Use the `BigQuery Storage API
        <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
        download query results quickly, but at an increased cost. To use this
        API, first `enable it in the Cloud Console
        <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
        You must also have the `bigquery.readsessions.create
        <https://cloud.google.com/bigquery/docs/access-control#roles>`__
        permission on the project you are billing queries to.

        This feature requires version 0.10.0 or later of the ``pandas-gbq``
        package. It also requires the ``google-cloud-bigquery-storage`` and
        ``fastavro`` packages.

        .. versionadded:: 0.25.0
    private_key : str, deprecated
        Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
        parameter and
        :func:`google.oauth2.service_account.Credentials.from_service_account_info`
        or
        :func:`google.oauth2.service_account.Credentials.from_service_account_file`
        instead.

        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    verbose : None, deprecated
        Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
        adjust verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()

    kwargs = {}

    # START: new kwargs.  Don't populate unless explicitly set.
    if use_bqstorage_api is not None:
        kwargs["use_bqstorage_api"] = use_bqstorage_api
    # END: new kwargs

    # START: deprecated kwargs.  Don't populate unless explicitly set.
    if verbose is not None:
        kwargs["verbose"] = verbose

    if private_key is not None:
        kwargs["private_key"] = private_key
    # END: deprecated kwargs

    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth,
        auth_local_webserver=auth_local_webserver, dialect=dialect,
        location=location, configuration=configuration,
        credentials=credentials, **kwargs)
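Per the deprecation notes above, service-account authentication now goes through the credentials parameter rather than private_key; a sketch with a placeholder key file:

from google.oauth2 import service_account

# Placeholder path; from_service_account_file is the documented replacement
# for the deprecated private_key argument.
creds = service_account.Credentials.from_service_account_file("/path/to/key.json")
df = read_gbq(
    "SELECT 1 AS probe",
    project_id="my-project",  # placeholder
    dialect="standard",
    credentials=creds,
)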