Example No. 1
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    mock_read_gbq = mock.Mock()
    mock_read_gbq.return_value = DataFrame([[1.0]])
    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
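Outside the mocked test, the same warning can be avoided by opting in to the new default explicitly; a minimal sketch (the query is a placeholder and credentials are assumed to be configured):

import pandas as pd

# Passing dialect explicitly opts in to standard SQL, so no FutureWarning is raised.
df = pd.read_gbq("SELECT 1", dialect="standard")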
Example No. 2
def test_read_gbq_without_new_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    pd.read_gbq("SELECT 1")

    assert "use_bqstorage_api" not in captured_kwargs
Example No. 3
def test_read_gbq_without_deprecated_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    pd.read_gbq("SELECT 1")

    assert "verbose" not in captured_kwargs
    assert "private_key" not in captured_kwargs
Example No. 4
def test_read_gbq_with_deprecated_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    private_key = object()
    pd.read_gbq("SELECT 1", verbose=True, private_key=private_key)

    assert captured_kwargs["verbose"]
    assert captured_kwargs["private_key"] is private_key
Example No. 5
def get_hist_series(table, varList, overflow=True, selection=None):
    if selection is None:
        selection = "INTEGER(selStatus)&2097151=2097151"
    
    variables = ",\n".join([ build_case_string(*var) for var in varList] + ["COUNT(1) as count"])
    groupers =  ','.join([var[1] for var in varList])

    h = """SELECT {1} FROM {0} WHERE {3}
    GROUP BY {2} ORDER BY {2}""".format(table, variables, groupers, selection)

    # Making the request 
    frame = pd.read_gbq(h,'ams-test-kostya')
    
    if not overflow:
        for __, v, __ in varList:
            frame = frame[frame[v] != -1.0]
            
    for __, v, bins in varList:
        frame[v] = frame[v].map(lambda x: -1 if x == -1 else bins[x]  )

    # Populating 
    edges = [ np.append(v[2],-1) if overflow else v[2] for v in varList ]
    newIndex = pd.MultiIndex.from_product( edges,
                                           names=[ v[1] for v in varList ] )
    series = frame.set_index(list(frame.columns[:-1]))['count']
    series = series.reindex(newIndex).dropna() 
    
    return series
Example No. 6
def hist2DCustomCommand(nBinsX, firstBinX, lastBinX, nBinsY, firstBinY, lastBinY, theCommand):
    binWidthX = float(lastBinX - firstBinX) / nBinsX
    binWidthY = float(lastBinY - firstBinY) / nBinsY

    try:
        return pd.read_gbq(theCommand, project_id=theProjectID)

    except ValueError:
        print('no json data')
Example No. 7
def query_to_dataframe(query):
  import pandas as pd
  import pkgutil
  privatekey = pkgutil.get_data('trainer', 'privatekey.json')
  print(privatekey[:200])
  return pd.read_gbq(query,
                     project_id='cloud-training-demos',
                     dialect='standard',
                     private_key=privatekey)
Example No. 8
def query_to_dataframe(query):
  import pandas as pd
  import pkgutil, json
  privatekey = pkgutil.get_data(KEYDIR, 'privatekey.json')
  print(privatekey[:200])
  return pd.read_gbq(query, 
                     project_id=PROJECT, 
                     dialect='standard',
                     private_key=privatekey)
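The private_key argument used by these two helpers was later deprecated in pandas-gbq; a rough sketch of the equivalent call with a credentials object (the key file name and project ID are placeholders, and google-auth is assumed to be installed):

from google.oauth2 import service_account
import pandas as pd

def query_to_dataframe_with_credentials(query):
  # Hypothetical variant: build credentials from a service-account key file
  # instead of passing the deprecated private_key argument.
  credentials = service_account.Credentials.from_service_account_file('privatekey.json')
  return pd.read_gbq(query,
                     project_id='cloud-training-demos',
                     dialect='standard',
                     credentials=credentials)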
Example No. 9
def histCustomCommand( theCommand, requery=False):
    dirName=os.environ.get('HOME')+'/.bigQueryCached/'
    if requery is False:
        cachedResult=getHistDataFrame(dirName,theCommand)
        if cachedResult is not None:
            print('CACHED')
            return cachedResult

    df=pd.read_gbq(theCommand, project_id=theProjectID)
    saveHistDataFrame(df,dirName,theCommand)
    return df
Example No. 10
    def test_roundtrip(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        df.to_gbq(destination_table, _get_project_id(), chunksize=None,
                  private_key=_get_private_key_path())

        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                             .format(destination_table),
                             project_id=_get_project_id(),
                             private_key=_get_private_key_path())
        assert result['num_rows'][0] == test_size
Example No. 11
    def _query_timeseries_model(self):
        query = """
        SELECT
        *
        FROM
        ML.FORECAST(
            MODEL `japan-grid-carbon-api{bqStageName}.{utility}.model_intensity_timeseries`,
            STRUCT({horizon_size} AS horizon)
        )
        """.format(bqStageName=self.bqStageName,
                   utility=self.utility,
                   horizon_size=HORIZON)

        return pd.read_gbq(query)
Example No. 12
def query_policy_for_current_date():
    today = date.today().strftime("%Y-%m-%d")
    query = """
        SELECT country_name, region_name, date, school_closing_notes, workplace_closing, workplace_closing_notes, cancel_public_events, cancel_public_events_notes, 
        restrictions_on_gatherings, restrictions_on_gatherings_notes, close_public_transit, close_public_transit_notes, stay_at_home_requirements, stay_at_home_requirements_notes, 
        contact_tracing, contact_tracing_notes, confirmed_cases, deaths, international_travel_controls, international_travel_controls_notes FROM `bigquery-public-data.covid19_govt_response.oxford_policy_tracker` 
        where ( country_name like 'United Kingdom' or country_name like 'China' or country_name like 'Australia') and date = '%s' order by date asc
    """ % today
    df = pd.read_gbq(query,
                     dialect='standard',
                     project_id=project_id,
                     credentials=credentials)
    df = define_data_type(df)
    return df
Example No. 13
def test_pandas_gbq_query():
    # [START bigquery_migration_pandas_gbq_query]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = 'TX'
        LIMIT 100
    """

    # Run a Standard SQL query using the environment's default project
    df = pandas.read_gbq(sql, dialect='standard')

    # Run a Standard SQL query with the project set explicitly
    project_id = 'your-project-id'
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    # [START bigquery_migration_pandas_gbq_query]
    df = pandas.read_gbq(sql, project_id=project_id, dialect='standard')
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
Example No. 14
    def _extract_daily_carbon_intensity_by_month_and_weekday_from_big_query(
            self):

        query = """
        SELECT
        EXTRACT(MONTH FROM datetime) AS month,
        EXTRACT(DAYOFWEEK FROM datetime) AS dayofweek,
        EXTRACT(HOUR FROM datetime) AS hour,
        """ + self._get_intensity_query_string() + """
        GROUP BY month, dayofweek, hour
        order by month, dayofweek, hour asc
        """

        return pd.read_gbq(query)
Example No. 15
def get_all_status_board():
    """
    Extract the entire prod_status_board table
    :return: DataFrame
    """
    query = f'''
    select *
    from `geultto.status_board.{cardinal}_prod`
    '''
    status_board_df = pd.read_gbq(query,
                                  project_id=_project_id,
                                  dialect='standard',
                                  private_key=_jwt)
    return status_board_df
Example No. 17
def states_county(step):
  if step == 0:
    query = firstQuery + str(tuple(states_list)) + ' AND county IN ' + str(tuple(county_list))
    print("-----------------------------------")
    print("Filtered by STATE AND COUNTY.")
    print("-----------------------------------")
  else:
    query = firstQuery + str(tuple(states_list))
    print("-----------------------------------")
    print("Filtered by STATE.")
    print("-----------------------------------")
  
  df = pd.read_gbq(query, "starry-braid-156516")
  return df, query
Example No. 18
def api_pricing():

    sql_prices = '''select location_id, member_type, plan, amount from `bikeshare-303620.TripsDataset.Pricing` '''
    pricing_df = pd.read_gbq(sql_prices,
                             project_id=gcp_project,
                             credentials=credentials,
                             dialect='standard')
    # print(pricing_df)

    json_obj = pricing_df.to_json(orient='records')
    json_loads = json.loads(json_obj)
    json_formatted_str = json.dumps(json_loads, indent=2)

    return json_formatted_str
Example No. 19
def test_pandas_gbq_legacy_query():
    # [START bigquery_migration_pandas_gbq_query_legacy]
    import pandas

    sql = """
        SELECT name
        FROM [bigquery-public-data:usa_names.usa_1910_current]
        WHERE state = 'TX'
        LIMIT 100
    """

    df = pandas.read_gbq(sql, dialect='legacy')
    # [END bigquery_migration_pandas_gbq_query_legacy]
    assert len(df) > 0
Example No. 20
def backup_database(event=None, context=None):
    # models currently running needing to be backed up
    # get live portfolio
    portfolio_query = """
        SELECT *
        FROM `oval-bot-232220.algorithmic_trader.portfolio`
    """
    pf_history = pd.read_gbq(portfolio_query).sort_values(
        by='Date').reset_index(drop=True)

    # get backup portfolio
    backup_query = """
        SELECT *
        FROM `oval-bot-232220.algorithmic_trader.backup`
    """
    backup_data = pd.read_gbq(backup_query).sort_values(by='Date').reset_index(
        drop=True)

    # add new portfolio to existing portfolio data and push to gbq
    pf_data = backup_data.append(pf_history.iloc[-1], ignore_index=True)
    pf_data.to_gbq(destination_table='algorithmic_trader.backup',
                   project_id="oval-bot-232220",
                   if_exists='replace')
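Note that DataFrame.append, used above to add the latest portfolio row to the backup, was removed in pandas 2.0; a sketch of the same step with concat:

# Equivalent to backup_data.append(pf_history.iloc[-1], ignore_index=True) on pandas >= 2.0.
pf_data = pd.concat([backup_data, pf_history.iloc[[-1]]], ignore_index=True)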
Example No. 21
def api_stations():

    sql_stations = f'select * from `bikeshare-303620.TripsDataset.Stations` '
    print(sql_stations)
    stations_df = pd.read_gbq(sql_stations,
                              project_id=gcp_project,
                              credentials=credentials,
                              dialect='standard')
    stations_data = stations_df.to_json(orient='records')

    json_loads = json.loads(stations_data)
    json_formatted_str = json.dumps(json_loads, indent=2)

    return json_formatted_str
Example No. 22
def get_all_slack_log():
    """
    Extract the entire prod_slack_log table, which accumulates entries from the second post onward
    :return: DataFrame
    """
    query = f'''
    select *
    from `geultto.slack_log.{cardinal}_prod`
    '''
    slack_log_df = pd.read_gbq(query,
                               project_id=_project_id,
                               dialect='standard',
                               private_key=_jwt)
    return slack_log_df
Example No. 23
    def _load(self) -> pd.DataFrame:
        load_args = copy.deepcopy(self._load_args)

        if self._filepath:
            load_path = get_filepath_str(PurePosixPath(self._filepath),
                                         self._protocol)
            with self._fs.open(load_path, mode="r") as fs_file:
                load_args["query"] = fs_file.read()

        return pd.read_gbq(
            project_id=self._project_id,
            credentials=self._credentials,
            **load_args,
        )
Example No. 24
def run():
    try:
        """Removing XML file from system if already exists"""
        for the_file in os.listdir(xml_folder+foldername):
            file_path = os.path.join(xml_folder+foldername, the_file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
                logging.info('Old file removed from system ....')

        """Reading all detail records from SQL into a pandas DataFrame -- in this case BigQuery is the input SQL source"""
        df_detail = pandas.read_gbq(readfileasstring(sql_query_folder+detail_sql_file), project_id=<PROJECT_ID>, dialect='standard')
        """Reading audit data from the audit SQL into a pandas DataFrame"""
        df_audit = pandas.read_gbq(readfileasstring(sql_query_folder+audit_sql_file), project_id=<PROJECT_ID>, dialect='standard')
        i = 0
        """Creating root elements and record tags for XML"""
        root = ET.Element("aaa_ncnu_billing")

        """Adding audit data"""
        audit = ET.SubElement(root, "audit_fields")
        ET.SubElement(audit, 'run_id').text = str(jobrunid)
        ET.SubElement(audit, 'processing_date').text = str(time.strftime("%Y-%m-%d"))
        ET.SubElement(audit, 'detail_record_count').text = str(len(df_detail.index))
        for (col, c) in zip(df_audit.columns, list(df_audit)):
            ET.SubElement(audit, c).text = str(df_audit[col][i])

        """Adding detail records data"""
        doc = ET.SubElement(root, "billing_records")
        """Iterating through all rows in the DataFrame"""
        while i < len(df_detail.index):
            rows = ET.SubElement(doc, "MEMBERSHIP")
            flg = 1
            """Creating all the subelements as rows and columns from the DataFrame"""
            for (col, c) in zip(df_detail.columns, list(df_detail)):
                ET.SubElement(rows, c).text = str(df_detail[col][i])
            i = i + 1
        tree = ET.ElementTree(root)
        tree.write(xml_folder+foldername+"/"+file_name)
    except Exception:
        # Placeholder handler (not in the original snippet) so the try block is syntactically complete.
        logging.exception('run() failed')
Example No. 25
def fundamental_analysis_collection(project_name):
    #ROCE_table=pd.read_gbq('SELECT * FROM pecten_dataset.ROCE_t WHERE Status = "active";', project_id=project_name)
    sales_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.sales_t WHERE Status = "active";', project_id=project_name)
    profit_margin_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.profit_margin_t WHERE Status = "active";', project_id=project_name)
    PER_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.PER_t WHERE Status = "active";', project_id=project_name)
    EPS_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.EPS_t WHERE Status = "active";', project_id=project_name)
    EBITDA_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.EBITDA_t WHERE Status = "active";', project_id=project_name)
    dividend_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.dividend_analysis_t WHERE Status = "active";', project_id=project_name)
    return sales_table,profit_margin_table,PER_table,EPS_table,EBITDA_table,dividend_table
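Since the six queries differ only in the table name, a loop keeps the function shorter; a possible sketch (a hypothetical variant that returns a dict keyed by table name instead of a tuple):

def fundamental_analysis_collection_dict(project_name):
    # Same tables and Status filter as above, built in a loop.
    tables = ['sales_t', 'profit_margin_t', 'PER_t', 'EPS_t', 'EBITDA_t', 'dividend_analysis_t']
    query = 'SELECT * FROM pecten_dataset_test.{} WHERE Status = "active";'
    return {t: pd.read_gbq(query.format(t), project_id=project_name) for t in tables}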
Example No. 26
def index_table(es, index_name, client, table, participant_id_column,
                sample_id_column, sample_file_columns, billing_project_id):
    """Indexes a BigQuery table.

    Args:
        es: Elasticsearch object.
        index_name: Name of Elasticsearch index.
        table: BigQuery table to index; its fully-qualified name has the format:
            "<project id>.<dataset id>.<table name>"
        participant_id_column: Name of the column containing the participant ID.
        sample_id_column: (optional) Name of the column containing the sample ID
            (only needed on samples tables).
        sample_file_columns: (optional) Mappings for columns which contain genomic
            files of a particular type (specified in ui.json).
        billing_project_id: GCP project ID to bill for reading table
    """
    _create_nested_mappings(es, index_name, table, sample_id_column)
    table_name = _table_name_from_table(table)
    start_time = time.time()
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    # There is no easy way to import BigQuery -> Elasticsearch. Instead:
    # BigQuery table -> pandas dataframe -> dict -> Elasticsearch
    df = pd.read_gbq('SELECT * FROM `%s`' % table_name,
                     project_id=billing_project_id,
                     dialect='standard')
    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime('%Hh:%Mm:%Ss', time.gmtime(elapsed_time))
    logger.info('BigQuery -> pandas took %s' % elapsed_time_str)
    logger.info('%s has %d rows' % (table_name, len(df)))

    if participant_id_column not in df.columns:
        raise ValueError(
            'Participant ID column %s not found in BigQuery table %s' %
            (participant_id_column, table_name))

    if sample_id_column in df.columns:
        scripts_by_id = _sample_scripts_by_id(df, table_name,
                                              participant_id_column,
                                              sample_id_column,
                                              sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id(df, table_name, participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime("%Hh:%Mm:%Ss", time.gmtime(elapsed_time))
    logger.info('pandas -> ElasticSearch index took %s' % elapsed_time_str)
Example No. 27
    def read_tces_from_bq(input_tce_table, project_id):
        """Read a table of Kepler KOIs from BigQuery.

    Args:
      input_tce_table: BigQuery table name containing TCE data.
      project_id: GCP project ID containing input TCE table.
    Returns:
      pd.DataFrame of TCEs.
    """
        query = 'SELECT * from `{}`'.format(input_tce_table)
        tce_table = pd.read_gbq(query,
                                project_id=project_id,
                                dialect='standard')

        return tce_table
Example No. 28
def query_bigquery_df(sql_query):
    """Query Google BigQuery using the provided SQL query and return a pandas DataFrame. Logs errors but does not throw any exceptions while accessing BigQuery.

    Args:
        sql_query (str): The SQL query string.

    Returns:
        pandas.DataFrame: The resulting DataFrame
    """
    assert _BIGQUERY_CLIENT, 'Global bigquery client is not configured yet.'
    df = None
    try:
        df = pd.read_gbq(sql_query, credentials=GLOBAL_CREDS)
    except Exception as e:
        logger.log_error('Exception during query: {}'.format(e))
    return df
Example No. 29
def get_profitability_color():
    QUERY = """SELECT Constituent, Constituent_id, Constituent_name,Total_profitability_score,
    (CASE 
        WHEN Total_profitability_score > 32 THEN '1'
        WHEN Total_profitability_score < 20  THEN '-1'
        ELSE '0'
    END) AS Profitability_color
    FROM pecten_dataset_test.Profitability_score_ranking 
    WHERE Status="active"
    GROUP BY Constituent_name, Constituent, Constituent_id, Total_profitability_score, Profitability_color
    ORDER BY Constituent"""
    profitability_colors = pd.read_gbq(QUERY,
                                       project_id='igenie-project',
                                       private_key=None)
    return profitability_colors
Example No. 30
def get_risk_color():
    QUERY = """SELECT Constituent, Constituent_id, Constituent_name,Risk_score,
    (CASE 
        WHEN Risk_score > 4 THEN '-1'
        WHEN Risk_score < 2  THEN '1'
        ELSE '0'
    END) AS Risk_color
    FROM pecten_dataset_test.Risk_ranking 
    WHERE Status="active"
    GROUP BY Constituent_name, Constituent, Constituent_id, Risk_score, Risk_color
    ORDER BY Constituent"""
    risk_colors = pd.read_gbq(QUERY,
                              project_id='igenie-project',
                              private_key=None)
    return risk_colors
Example No. 31
def get_stats(client, project, table_ref, empty_string='""', max_size=50):
    # Fetch the table schema and process its columns in chunks of at most max_size,
    # so no single query grows too large.
    schema = client.get_table(f"{table_ref.dataset_id}.{table_ref.table_id}").schema
    num_columns = len(schema)
    num_repeats = -(-num_columns // max_size)  # ceiling division

    # One UNION ALL query per chunk; each aggregate keeps the column's original
    # ordinal position so the combined result can be ordered by it.
    sqls = (
        " UNION ALL ".join(
            aggregate(f, table_ref, j * max_size + i, empty_string)
            for i, f in enumerate(schema[j * max_size : min(num_columns, (j + 1) * max_size)])
        )
        + " ORDER BY ord;"
        for j in range(num_repeats)
    )
    # Run each chunked query with read_gbq and concatenate the per-chunk results.
    dfs = (pd.read_gbq(sql, project_id=project, dialect="standard") for sql in sqls)
    return pd.concat(dfs)
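A hypothetical call to get_stats, assuming the aggregate helper above and the google-cloud-bigquery client library are available; the project and table names are placeholders:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")
table_ref = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
stats = get_stats(client, "my-project", table_ref)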
Example No. 32
def get_london_cycle_hires(
    credentials: service_account.Credentials,
) -> pd.DataFrame:
    query = sql

    # Create dataframe of London cycle hire data
    df = pd.read_gbq(
        query,
        project_id="london-cycles-306117",
        dialect="standard",
        credentials=credentials,
        location="EU",
    )

    return df
Example No. 33
def get_covid_deaths(
    credentials: service_account.Credentials,
) -> pd.DataFrame:
    query = sql

    # Create dataframe of COVID death data
    df = pd.read_gbq(
        query,
        project_id="covid-deaths",
        dialect="standard",
        credentials=credentials,
        location="EU",
    )

    return df
Example No. 34
def process_standard(indicator, identifier):
    sql = """
    SELECT
      {} as {},
      {}_raw as raw,
      {}_score as score,
      {}_cat as cat,
      {}_label as label
    FROM
      `{}.{}.{}`
    """.format(identifier, IDENTIFIERS[identifier], indicator, indicator,
               indicator, indicator, BQ_PROJECT_ID, BQ_DATASET_NAME,
               BQ_IN[indicator])
    df = pd.read_gbq(query=sql, dialect="standard")
    return df
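With six positional {} placeholders the argument order is easy to get wrong; the same query built with named placeholders might read as follows (a sketch; identifier and indicator are the function arguments, and the BQ_* constants are assumed as in the original):

sql = """
SELECT
  {id} as {alias},
  {ind}_raw as raw,
  {ind}_score as score,
  {ind}_cat as cat,
  {ind}_label as label
FROM
  `{project}.{dataset}.{table}`
""".format(id=identifier, alias=IDENTIFIERS[identifier], ind=indicator,
           project=BQ_PROJECT_ID, dataset=BQ_DATASET_NAME, table=BQ_IN[indicator])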
Example No. 35
def main():
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hi:o:k:v',
            ['help', 'input=', 'output=', 'key=', 'verbose'])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    verbose = False
    inputfile = 'input.sql'
    outputfile = 'output.csv'
    key = 'key.json'
    project_id = ''
    for o, a in opts:
        if o in ('-h', '--help'):
            print(
                "usage: gbq-csv-cli.py -i <inputfile> -o <outputfile>. By default takes 'input.sql' as input and 'output.csv' as output, 'key.json' as credentials key"
            )
            sys.exit()
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('-i', '--input'):
            inputfile = a
        elif o in ('-o', '--output'):
            outputfile = a
        elif o in ('-k', '--key'):
            key = a
    print("input is {}, output is {}, key is {}\n".format(
        inputfile, outputfile, key)) if verbose else None
    try:
        query = open(inputfile, 'r').read()
        print("query: {}".format(query)) if verbose else None
        project_id = json.load(open(key))['project_id']
        print("project_id is {}".format(project_id)) if verbose else None
    except Exception as e:
        print("exception: {}".format(str(e)))
        sys.exit(2)
    try:
        df = pd.read_gbq(query,
                         project_id=project_id,
                         dialect="standard",
                         private_key=key)
        print(df.head()) if verbose else None
        df.to_csv(outputfile, index=False)
        print("Done! {} rows saved to {}".format(len(df), outputfile))
    except Exception as e:
        print("Error running query: {}".format(str(e)))
        sys.exit(2)
Example No. 36
def get_recs_w_date_range(date_s,date_e):
    """Iterate through various date ranges to create the a timeframe sample for later aggregation"""
    global dfx2   # Otherwise, dfx2 is considered Local and will not be global scope of the dataframe created above
    query = """SELECT Timestamp, dst_addr,
                 cast(dst_port as integer) as dst_port,
                 cast(duration_ms as integer) as duration_ms,
                 cast(bytes as integer) as bytes,
                 protocol, flow_direction 
               FROM ipfix.ipfix                
               WHERE Timestamp BETWEEN timestamp('{}') AND timestamp('{}')     
               LIMIT 10 """.format(date_s,date_e)
    #print('The value of local var date_s is: {}'.format(date_s))
    #print('The value of local var date_e is: {}'.format(date_e))
    dfx1 = pd.read_gbq(query, project_id="network-sec-analytics",reauth=True)
    dfx2 = dfx2.append(dfx1)                                       # Append onto the dfx2 dataframe
    return           
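As in Example No. 20, the append call above no longer exists in pandas 2.0; the same accumulation with concat would be (sketch):

    dfx2 = pd.concat([dfx2, dfx1])  # replaces dfx2.append(dfx1) on pandas >= 2.0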
Example No. 37
def get_bigquery(args):
    from utils.Storage import Storage
    storage_client = Storage(args.google_key_path)
    q1 = """SELECT
  date AS ds,
  closing_price AS y
FROM
`pecten_dataset.stockprice_historical_collection`
where 
constituent_name = 'DAIMLER AG'
order by 
date"""

    query1 = pd.read_gbq(q1, project_id='igenie-project', dialect='standard')
    #    print(query1)
    args.query1 = query1
Example No. 38
def get_attack_on_civilians():
    # SQL query for attack on civilians (event code) by country code in 2018
    query = ('''
        SELECT country_code, COUNT(*) AS attacks
        FROM (SELECT Actor2CountryCode country_code FROM [gdelt-bq:gdeltv2.events] WHERE YEAR=2018 AND EventCode="170")
        GROUP BY country_code''')

    # Run query and convert to panda data frame
    df = pd.read_gbq(query, "gdelt-201419")

    # Assign country name
    df_label = get_country_by_ISO3_code()
    df = pd.merge(df, df_label, on='country_code')
    df = df[['country_code', 'country', 'attacks']]

    return df
Example No. 39
def api_weather():
    # locationID = 1
    # startDate = '01/01/2019'
    # endDate = '12/31/2019'
    sql_weather = f'select forecast_date, maxTempC, humidity, total_precip, avg_cloudcover, avg_windspeed, location_id ' \
                    f'from `bikeshare-303620.TripsDataset.HistoricalWeather` order by location_id, forecast_date'
    weather_df = pd.read_gbq(sql_weather,
                             project_id=gcp_project,
                             credentials=credentials,
                             dialect='standard')
    weather = weather_df.to_json(orient='records')

    json_loads = json.loads(weather)
    json_formatted_str = json.dumps(json_loads, indent=2)

    return json_formatted_str
Example No. 40
def get_geohash_of_point(centroid, project_id=None):
    if not project_id:
        project_id = DEFAULT_PROJECT

    sql = '''with test as (
    SELECT
    st_contains(st_geogfromtext(wkt_geom),
        st_geogpoint({lon},
            {lat}) ) as grid,
            geohash
    FROM
    `stormwaterheatmap-hydrology.geometry.geohash` )
    select geohash from test
    where grid
    '''.format(lon=centroid.x, lat=centroid.y)
    return pandas.read_gbq(sql, project_id=project_id).geohash[0]
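A hypothetical call to get_geohash_of_point, assuming the centroid is a shapely point exposing .x/.y attributes; the coordinates and project ID are placeholders:

from shapely.geometry import Point

geohash = get_geohash_of_point(Point(-122.33, 47.61), project_id='my-project')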
Example No. 41
def load_product():
    print('Loading product')
    query = '''
SELECT
  product_id,
  product_type_id,
  brand_id,
  manufacturer_id,
  product_group_id,
  team_id,
  subproduct_type_id
FROM
  `coolblue-bi-platform-prod.master.product`
'''

    return pd.read_gbq(query, 'coolblue-bi-platform-prod', dialect='standard')
Example No. 42
    def test_roundtrip(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
                  private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                             .format(destination_table),
                             project_id=_get_project_id(),
                             private_key=_get_private_key_path())
        self.assertEqual(result['num_rows'][0], test_size)
Example No. 43
    def get_data(self, type, station_ids, n_years):
        query = """
            SELECT station_number, year, month, day, {type} as value, rain, snow
            FROM `publicdata.samples.gsod`
            WHERE station_number IN ({stns})
            AND year < 2010
            AND year >= {minyr}
        """.format(
            type=type,
            stns=','.join(station_ids),
            minyr=2010 - n_years
        )

        df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

        stations_df = pd.DataFrame({
            'location': stations.keys(),
            'station_number': [int(v) for v in stations.values()]
        })
        df = pd.merge(df, stations_df, on='station_number')
        return df
Example No. 44
def main(group1, group2, d, sample_size, iterations, bin_size, test_type, experiment_id):
    project_id = "crafty-campaign-106420"

    test_data = pd.read_gbq(
        "select * from AvengersCrossEvent.v_ABTest_%s where experiment_id = %s"
        % (test_type.upper(), experiment_id),
        project_id,
    )

    p_values, day_d, day_d2 = resampling(
        test_data, group1, group2, iterations, sample_size, d, test_type
    )  # 75.2 #161.396529913 #321.671721935
    rejected_perc = reject_rate(p_values)
    print("--------------------------------------------------------------")
    print("Summary")
    print("Out of %i simulated tests, reject null hypothesis of equal means %d%% of the time" % (
        iterations,
        rejected_perc,
    ))

    plot_dist(day_d, day_d2, group1, group2, bin_size, test_type)
    print(np.mean(day_d), np.mean(day_d2))
Example No. 45
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
  """Read data from BigQuery and split into train and validation sets.

  Args:
    full_table_path: (string) full path of the table containing training data
      in the format of [project_id.dataset_name.table_name].
    project_id: (string, Optional) Google BigQuery Account project ID.
    num_samples: (int, Optional) Number of data samples to read.

  Returns:
    pandas.DataFrame
  """

  query = metadata.BASE_QUERY.format(table=full_table_path)
  limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
  query += limit

  # Use "application default credentials"
  # Use SQL syntax dialect
  data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

  return data_df
Example No. 46
def check_ga_session_date_exist(destination_table, date, credential_path):  # check if destination table has data of certain date
    if not check_table_exist(destination_table):  # has no certain date if the table not exist
        return False
    query = 'SELECT count(*) FROM [%s] WHERE DATE(ga_session_date) == "%s"' % (destination_table, date.strftime("%Y-%m-%d"))
    return (pd.read_gbq(query, project_id=config.PROJECT_ID, verbose=False, private_key=credential_path).iloc[0, 0] > 0)
Example No. 47
    return {"status": "failure", "data_size": total_row}


if __name__ == "__main__":
    # Parse the argument to get the credential_path
    parser = argparse.ArgumentParser(description='Input secre_json_path and corresponding dataset')
    parser.add_argument('--credential_path', type=str, dest='credential_path', required=True, help='input the path of service account credential from gcp, use $gcp_service_account in jenkings')
    args = vars(parser.parse_args())
    credential_path = args["credential_path"]
    # Use google_auth library to get access to google
    Auth = google_auth(credential_path)
    bigquery = Auth.get_auth('bigquery_v2')
    analytics = Auth.get_auth('analytics_v4')
    # Check if the GA_BQ_UPLOAD_STATUS_LOG table exist in gbq
    if check_table_exist(config.GA_BQ_UPLOAD_STATUS_LOG):
        ga_bq_upload_status_log = pd.read_gbq(query="SELECT * FROM [%s]" % config.GA_BQ_UPLOAD_STATUS_LOG, project_id=config.PROJECT_ID, private_key=credential_path)
    else:
        ga_bq_upload_status_log = pd.DataFrame(columns=['config_name', 'ga_session_date', 'status', 'backup_date', "uploaded_data_size"])
    # Set the time region
    d = config.DATE_INIT.split("-")
    date_init = datetime.date(int(d[0]),int(d[1]),int(d[2]))
    date_now = datetime.datetime.now().date()

    for config_name in config.ga_bq_config:
        for date in daterange(date_init, date_now):
            destination_table = config.ga_bq_config[config_name]["destination_table"]
            print "start checking (%s, %s) pair for GA to BQ" % (config_name, date)
            condition = (ga_bq_upload_status_log["config_name"]==config_name) & (ga_bq_upload_status_log["ga_session_date"]==date.strftime("%Y-%m-%d"))
            if ga_bq_upload_status_log[condition].empty:  # no such condition, totally new table-date pair
                print('find no pair within the record, try to upload data with (%s, %s)' % (config_name, date))
                if check_ga_session_date_exist(destination_table, date, credential_path):
Example No. 48
    
    user = "******"+str(i)
    print "Analysis for <arlearn"+str(i)+"@gmail.com>"
    
    # --------------------
    #/    RATINGS        /
    #--------------------
    
    # Collect all the ratings    
    RTquery = "SELECT *  FROM "+table_name+" WHERE " \
    "origin = 'rating' AND actorID='mailto:arlearn"+str(i)+"@gmail.com' " \
    " AND timestamp > PARSE_UTC_USEC('"+startDay+" 07:00:00') AND timestamp < " \
    " PARSE_UTC_USEC('"+endDay+" 20:00:00') ORDER by timestamp"
    
    # Populating the dataframe 
    RTframe = pd.read_gbq(RTquery, project_id) 
    # Filtering the results
    RTdf = RTframe[['timestamp','objectId','resultResponse','lat','lng']]
    #Rename the columns
    RTdf.rename(columns={'objectId':'Indicators'}, inplace=True)
    RTdf.rename(columns={'resultResponse':'value'}, inplace=True)    
    #Reshape the DataFrame
    RTrsh = RTdf.set_index(['timestamp','Indicators'])['value']
    # In case of duplicates     
    RTrsh = RTrsh[~RTrsh.index.duplicated(keep=False)]  # Index.get_duplicates() was removed from pandas
    RTrsh = RTrsh.unstack() 
    
    # Fix: the index will shift -1 hr. E.g. 9:00 -> 8:00 
    # Indicating the rating done at 9:00 for the 8:xx activities    
    RTrsh.index = RTrsh.index-pd.offsets.Hour(1)
Example No. 49
def query_to_dataframe(query):
  import pandas as pd
  return pd.read_gbq(query,
                     project_id='cloud-training-demos',
                     dialect='standard')
Example No. 50
    def get_df(self, query):
        return pd.read_gbq(query=query, project_id=self.project_id, private_key=self.credential_path, verbose=False)
Example No. 51
def query_to_dataframe(query):
  import pandas as pd
  return pd.read_gbq(query,
                     project_id='cloud-training-demos',
                     dialect='standard',
                     private_key='trainer/privatekey.json')
Example No. 52
#job_id, _results = client.query('select * from tqin.v_ABRetention')


#complete = False

#while True:
#    if complete == False:
#        complete, row_count = client.check_job(job_id)
#    else:
#        break

#results = client.get_query_rows(job_id)

#retention_data = json_normalize(results)

retention_data = pd.read_gbq('select * from tqin.v_ABRetention', project_id)

#retention_data = pd.read_csv('/Users/thomasqin/Documents/python/abretention.csv')
test_groups = ['control','v1','v2','v3']
#now = datetime.now()
#current_date = now.strftime("%m-%d-%y")


#date_converter = lambda x: datetime.fromtimestamp(x)

#retention_data['install_date'] = retention_data['install_date'].apply(date_converter)

#player_id_control = retention_data_control['player_id']
#player_id_test = retention_data_test['player_id']