Example no. 1
0
def _telem_parsed_to_sql(date_str,
                         channel,
                         measure_iter,
                         version    = None,
                         query_file = _UPDATE_SQL_FILE):
    """ Stages parsed telemetry measures and runs the update SQL against the
    sentiment database.

    Args:
        date_str     (str): 'YYYY-MM-DD' date the measures belong to.
        channel      (str): release channel; selects the
                            `<channel>_start_date`/`<channel>_end_date`
                            columns of sentiment.release_info.
        measure_iter:       iterable of rows matching `mappings` below.
        version      (str): product version; looked up from release_info
                            when None.
        query_file   (str): path of the SQL file executed after staging.
    """

    db = Db('telemetry', is_persistent = True)

    # Resolve the version from release_info when the caller didn't supply
    # one.  The channel name selects column identifiers, so it must be
    # formatted in; the date is a value and is passed as a bound parameter.
    if not version:
        query = """
                SELECT
                    version
                FROM
                    sentiment.release_info ri
                WHERE
                    :date >= ri.{channel}_start_date
                    AND :date <= ri.{channel}_end_date
            ;""".format(channel=channel)
        version = str(db.execute_sql(query, {'date': date_str}).first()[0])


    # TODO(rrayborn): figure out why this won't work as a temp table with our
    # consistency rules + sqlalchemy's implicit transactions
    db.execute_sql('DROP TABLE IF EXISTS tmp_weekly_stats;')

    query ='''CREATE TABLE tmp_weekly_stats (
            os                       ENUM('Windows','Mac','Linux'),
            measure                  VARCHAR(200),
            measure_value            VARCHAR(200),
            users                    INT,
            measure_average          FLOAT,
            measure_nonzero_average  FLOAT,
            active_users             FLOAT,
            potential_users          INT
        );'''
    db.execute_sql(query)

    # Column position in each measure_iter row -> destination column name.
    mappings = {
                0:'measure',
                1:'os',
                2:'measure_average',
                3:'measure_nonzero_average',
                4:'active_users',
                5:'potential_users',
                6:'measure_value'
            }
    db.insert_data_into_table(measure_iter, 'tmp_weekly_stats', mappings)


    # Run the update SQL against the staged rows.  (The original guarded the
    # read with `if query_sql:`, which is always true for an open file
    # object, so the check was dead code and has been dropped.)
    with open(query_file, 'r') as query_sql:
        query = query_sql.read()

    db.execute_sql(query, {'week':date_str, 'channel':channel, 'version':version})
Example no. 2
0
def update(
            product    = 'both',
            start_date = None,
            end_date   = None,
            ua_db      = None,
            input_db   = None,
            sumo_db    = None
        ):
    """Refreshes the sentiment stats tables for a product over a date range.

    Args:
        product    (str): 'desktop', 'mobile', or 'both' (default).
        start_date (str): inclusive 'YYYY-MM-DD' start; defaults to 15 days
                          before today.
        end_date   (str): inclusive 'YYYY-MM-DD' end; defaults to yesterday.
        ua_db:            sentiment DB handle; created here when None.
        input_db:         Input DB handle; created here when None.
        sumo_db:          Sumo DB handle; created here when None.
    """

    # Compute the date-window defaults at call time.  The original put the
    # date arithmetic in the signature, freezing "today" at module import —
    # stale dates for any long-running or daemonized caller.
    if start_date is None:
        start_date = (date.today() - timedelta(days=15)).strftime('%Y-%m-%d')
    if end_date is None:
        end_date   = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')

    if not ua_db:
        ua_db    = UA_DB('sentiment', is_persistent = True)
        ua_db.execute_sql('SET autocommit=1;')
    if not input_db:
        input_db = Input_DB(is_persistent = True)
    if not sumo_db:
        sumo_db  = Sumo_DB(is_persistent = True)

    # NOTE(review): an unused `input_file` local — a copy/paste duplicate of
    # the `_adis.tsv` path — was removed here.
    adis_file   = _DATA_PATH + '.' + product +'_adis.tsv'
    visits_file = _DATA_PATH + '.' + product +'_visits.tsv'

    params = {'start_date': start_date, 'end_date': end_date}

    # =========== Create tables ============================================
    _execute_query(ua_db, _CREATE_SQL_FILE, params=params, multi=True)

    # =========== Parse heartbeat data =========================================
    #TODO(rrayborn): should this be its own pipeline?

    # Fetch heartbeat data from the Input DB
    data = _execute_query(input_db, _HEARTBEAT_SQL_FILE, params=params)

    # Insert heartbeat data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'daily_heartbeat_stats', header,
                                 has_header = False, is_replace=True)

    # =========== Parse input data =============================================

    # Fetch Input data
    data = _execute_query(input_db, _INPUT_SQL_FILE, params=params)

    # Create tmp_input staging table
    query ='''CREATE TEMPORARY TABLE tmp_input (
                `date`                     DATE,
                `version`                  INT,
                `is_desktop`               BOOL,
                `input_average`            FLOAT,
                `input_volume`             INT,
                `heartbeat_average`        FLOAT,
                `heartbeat_surveyed_users` INT,
                `heartbeat_volume`         INT
            );'''
    ua_db.execute_sql(query)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_input', header, has_header = False)

    # =========== Create base table ============================================
    _execute_query(ua_db, _BASE_SQL_FILE, params=params)

    # =========== Parse sumo data =============================================

    # Fetch Sumo data
    data = _execute_query(sumo_db, _SUMO_SQL_FILE, params=params)

    # Create tmp_sumo staging table
    query ='''CREATE TEMPORARY TABLE tmp_sumo (
                `date`                     DATE,
                `version`                  INT,
                `num_unanswered_72`        INT,
                `is_desktop`               BOOL,
                `num_posts`                INT
            );'''
    ua_db.execute_sql(query)

    # Insert Sumo data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_sumo', header, has_header = False)

    # =========== Parse ADI data ===============================================

    # Generate query
    # TODO(rrayborn): Need to investigate why part of the end date is missing.
    #                 Doesn't seem to affect the start_date...
    today_minus_three = (date.today() - timedelta(days=3)).strftime('%Y-%m-%d')
    # ISO date strings compare correctly as strings, so min() is safe here.
    adi_end_date = min(today_minus_three, end_date)
    with open(_ADIS_SQL_FILE, 'r') as adis_sql:
        query = adis_sql.read().replace('\n','  ') % (start_date, adi_end_date)

    # Generate/execute command line
    # SECURITY(review): `query` is interpolated into a shell string with
    # shell=True — safe only while _ADIS_SQL_FILE is trusted.  Prefer
    # subprocess.run([...], input=query) with a list argv.
    cmd = 'echo "%s" | isql -v metrics_dsn  -b -x0x09  >%s' # | tail -n+10'
    check_output(cmd % (query, adis_file), shell=True)

    # Create tmp table for ADI counts
    query ='''CREATE TEMPORARY TABLE tmp_adis (
                `date`                DATE,
                version               INT,
                is_desktop            BOOL,
                num_adis              INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'num_adis']

    ua_db.insert_csv_into_table(adis_file, 'tmp_adis', header, delimiter = '\t')

    # =========== Parse Analytics data =========================================

    # Get Google analytics data
    google_analytics.generate_inproduct(
            db          = ua_db,
            device_type = product,
            filename    = visits_file,
            start_date  = start_date,
            end_date    = end_date
        )

    # Create tmp table for SUMO visit counts
    query ='''CREATE TEMPORARY TABLE tmp_sumo_visits (
                `date`      DATE, 
                version     INT,
                is_desktop  BOOL,
                visits      INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'visits']
    ua_db.insert_csv_into_table(visits_file, 'tmp_sumo_visits', header, delimiter = '\t')


    # =========== Parse Play data ==============================================

    query = '''CREATE TEMPORARY TABLE tmp_play AS 
        SELECT
            `date`, 
            version,
            AVG(rating) AS play_average,
            COUNT(*)    AS play_volume
        FROM google_play_reviews
        WHERE
                `date` >= :start_date
            AND `date` <= :end_date
        GROUP BY 1,2;
    '''
    ua_db.execute_sql(query, params)

    # =========== Run Stats query ==============================================
    query_files = []
    if product == 'both' or product == 'desktop':
        query_files.append(_DESKTOP_FILE_PATTERN)
    if product == 'both' or product == 'mobile':
        query_files.append(_MOBILE_FILE_PATTERN)

    for query_file in query_files:
        _execute_query(ua_db,query_file, params=params)
def update(product='both',
           start_date=None,
           end_date=None,
           ua_db=None,
           input_db=None,
           sumo_db=None):
    """Refresh the sentiment stats tables for a product over a date range.

    Args:
        product (str): 'desktop', 'mobile', or 'both' (default).
        start_date (str): inclusive 'YYYY-MM-DD' start; defaults to 15 days
            before today.
        end_date (str): inclusive 'YYYY-MM-DD' end; defaults to yesterday.
        ua_db: sentiment DB handle; created here when None.
        input_db: Input DB handle; created here when None.
        sumo_db: Sumo DB handle; created here when None.
    """

    # Compute the date-window defaults at call time.  The original evaluated
    # the date arithmetic in the signature, freezing "today" at module
    # import — stale dates for any long-running caller.
    if start_date is None:
        start_date = (date.today() - timedelta(days=15)).strftime('%Y-%m-%d')
    if end_date is None:
        end_date = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')

    if not ua_db:
        ua_db = UA_DB('sentiment', is_persistent=True)
        ua_db.execute_sql('SET autocommit=1;')
    if not input_db:
        input_db = Input_DB(is_persistent=True)
    if not sumo_db:
        sumo_db = Sumo_DB(is_persistent=True)

    # NOTE(review): an unused `input_file` local — a copy/paste duplicate of
    # the `_adis.tsv` path — was removed here.
    adis_file = _DATA_PATH + '.' + product + '_adis.tsv'
    visits_file = _DATA_PATH + '.' + product + '_visits.tsv'

    params = {'start_date': start_date, 'end_date': end_date}

    # =========== Create tables ============================================
    _execute_query(ua_db, _CREATE_SQL_FILE, params=params, multi=True)

    # =========== Parse heartbeat data =========================================
    #TODO(rrayborn): should this be its own pipeline?

    # Fetch heartbeat data from the Input DB
    data = _execute_query(input_db, _HEARTBEAT_SQL_FILE, params=params)

    # Insert heartbeat data
    header = data.keys()
    ua_db.insert_data_into_table(data,
                                 'daily_heartbeat_stats',
                                 header,
                                 has_header=False,
                                 is_replace=True)

    # =========== Parse input data =============================================

    # Fetch Input data
    data = _execute_query(input_db, _INPUT_SQL_FILE, params=params)

    # Create tmp_input staging table
    query = '''CREATE TEMPORARY TABLE tmp_input (
                `date`                     DATE,
                `version`                  INT,
                `is_desktop`               BOOL,
                `input_average`            FLOAT,
                `input_volume`             INT,
                `heartbeat_average`        FLOAT,
                `heartbeat_surveyed_users` INT,
                `heartbeat_volume`         INT
            );'''
    ua_db.execute_sql(query)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_input', header, has_header=False)

    # =========== Create base table ============================================
    _execute_query(ua_db, _BASE_SQL_FILE, params=params)

    # =========== Parse sumo data =============================================

    # Fetch Sumo data
    data = _execute_query(sumo_db, _SUMO_SQL_FILE, params=params)

    # Create tmp_sumo staging table
    query = '''CREATE TEMPORARY TABLE tmp_sumo (
                `date`                     DATE,
                `version`                  INT,
                `is_desktop`               BOOL,
                `num_unanswered_72`        INT,
                `num_posts`                INT
            );'''
    ua_db.execute_sql(query)

    # Insert Sumo data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_sumo', header, has_header=False)

    # =========== Parse ADI data ===============================================

    # Generate query
    # TODO(rrayborn): Need to investigate why part of the end date is missing.
    #                 Doesn't seem to affect the start_date...
    today_minus_three = (date.today() - timedelta(days=3)).strftime('%Y-%m-%d')
    # ISO date strings compare correctly as strings, so min() is safe here.
    adi_end_date = min(today_minus_three, end_date)
    with open(_ADIS_SQL_FILE, 'r') as adis_sql:
        query = adis_sql.read().replace('\n',
                                        '  ') % (start_date, adi_end_date)

    # Generate/execute command line
    # SECURITY(review): `query` is interpolated into a shell string with
    # shell=True — safe only while _ADIS_SQL_FILE is trusted.  Prefer
    # subprocess.run([...], input=query) with a list argv.
    cmd = 'echo "%s" | isql -v metrics_dsn  -b -x0x09  >%s'  # | tail -n+10'
    check_output(cmd % (query, adis_file), shell=True)

    # Create tmp table for ADI counts
    query = '''CREATE TEMPORARY TABLE tmp_adis (
                `date`                DATE,
                version               INT,
                is_desktop            BOOL,
                num_adis              INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'num_adis']

    ua_db.insert_csv_into_table(adis_file, 'tmp_adis', header, delimiter='\t')

    # =========== Parse Analytics data =========================================

    # Get Google analytics data
    google_analytics.generate_inproduct(db=ua_db,
                                        device_type=product,
                                        filename=visits_file,
                                        start_date=start_date,
                                        end_date=end_date)

    # Create tmp table for SUMO visit counts
    query = '''CREATE TEMPORARY TABLE tmp_sumo_visits (
                `date`      DATE, 
                version     INT,
                is_desktop  BOOL,
                visits      INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'visits']
    ua_db.insert_csv_into_table(visits_file,
                                'tmp_sumo_visits',
                                header,
                                delimiter='\t')

    # =========== Parse Play data ==============================================

    query = '''CREATE TEMPORARY TABLE tmp_play AS 
        SELECT
            `date`, 
            version,
            AVG(rating) AS play_average,
            COUNT(*)    AS play_volume
        FROM google_play_reviews
        WHERE
                `date` >= :start_date
            AND `date` <= :end_date
        GROUP BY 1,2;
    '''
    ua_db.execute_sql(query, params)

    # =========== Run Stats query ==============================================
    query_files = []
    if product == 'both' or product == 'desktop':
        query_files.append(_DESKTOP_FILE_PATTERN)
    if product == 'both' or product == 'mobile':
        query_files.append(_MOBILE_FILE_PATTERN)

    for query_file in query_files:
        _execute_query(ua_db, query_file, params=params)