Example #1
def _telem_parsed_to_sql(date_str,
                         channel,
                         measure_iter,
                         version    = None,
                         query_file = _UPDATE_SQL_FILE):
    """ Loads _INTERMEDIATE_CSV into the sentiment database """

    db = Db('telemetry', is_persistent = True)

    # Look up the release version for this date/channel if none was provided
    if not version:
        query = """
                SELECT
                    version
                FROM
                    sentiment.release_info ri
                WHERE
                    '{date}' >= ri.{channel}_start_date
                    AND '{date}' <= ri.{channel}_end_date
            ;""".format(date=date_str, channel=channel)
        version = str(db.execute_sql(query).first()[0])


    # TODO(rrayborn): figure out why this won't work as a temp table with our
    # consistency rules + sqlalchemy's implicit transactions
    query = '''DROP TABLE IF EXISTS tmp_weekly_stats;'''
    db.execute_sql(query)

    query = '''CREATE TABLE tmp_weekly_stats (
            os                       ENUM('Windows','Mac','Linux'),
            measure                  VARCHAR(200),
            measure_value            VARCHAR(200),
            users                    INT,
            measure_average          FLOAT,
            measure_nonzero_average  FLOAT,
            active_users             FLOAT,
            potential_users          INT
        );'''
    db.execute_sql(query)
    mappings = {
                0:'measure',
                1:'os',
                2:'measure_average',
                3:'measure_nonzero_average',
                4:'active_users',
                5:'potential_users',
                6:'measure_value'
            }
    db.insert_data_into_table(measure_iter, 'tmp_weekly_stats', mappings)


    with open(query_file, 'r') as query_sql:
        query = query_sql.read()

    db.execute_sql(query, {'week':date_str, 'channel':channel, 'version':version})
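A minimal usage sketch, assuming the parsed measures are supplied as an iterable of rows in the column order declared by `mappings`; the date, channel, and sample rows below are illustrative only:

# Hypothetical driver for _telem_parsed_to_sql; the rows are made up but
# follow the column order declared in `mappings` above:
# (measure, os, measure_average, measure_nonzero_average,
#  active_users, potential_users, measure_value)
sample_rows = [
    ('crashes', 'Windows', 0.12, 1.4, 120000, 450000, 'total'),
    ('crashes', 'Mac',     0.08, 1.1,  30000,  90000, 'total'),
]
_telem_parsed_to_sql('2015-06-01', 'release', iter(sample_rows))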
Example #2
def update(
            product    = 'both',
            start_date = (date.today() - timedelta(days=15)).strftime('%Y-%m-%d'),
            end_date   = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
            ua_db      = None,
            input_db   = None,
            sumo_db    = None
        ):

    if not ua_db:
        ua_db    = UA_DB('sentiment', is_persistent = True)
        ua_db.execute_sql('SET autocommit=1;')
    if not input_db:
        input_db = Input_DB(is_persistent = True)
    if not sumo_db:
        sumo_db  = Sumo_DB(is_persistent = True)

    adis_file   = _DATA_PATH + '.' + product +'_adis.tsv'
    input_file  = _DATA_PATH + '.' + product +'_adis.tsv'
    visits_file = _DATA_PATH + '.' + product +'_visits.tsv'

    params = {'start_date': start_date, 'end_date': end_date}

    # =========== Create tables ============================================
    _execute_query(ua_db, _CREATE_SQL_FILE, params=params, multi=True)

    # =========== Parse heartbeat data =========================================
    #TODO(rrayborn): should this be its own pipeline?
    
    # Fetch Input data
    data = _execute_query(input_db, _HEARTBEAT_SQL_FILE, params=params)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'daily_heartbeat_stats', header,
                                 has_header = False, is_replace=True)

    # =========== Parse input data =============================================
    
    # Fetch Input data
    data = _execute_query(input_db, _INPUT_SQL_FILE, params=params)

    # Create tmp_input table
    query = '''CREATE TEMPORARY TABLE tmp_input (
                `date`                     DATE,
                `version`                  INT,
                `is_desktop`               BOOL,
                `input_average`            FLOAT,
                `input_volume`             INT,
                `heartbeat_average`        FLOAT,
                `heartbeat_surveyed_users` INT,
                `heartbeat_volume`         INT
            );'''
    ua_db.execute_sql(query)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_input', header, has_header = False)
    
    # =========== Create base table ============================================
    _execute_query(ua_db, _BASE_SQL_FILE, params=params)
    
    # =========== Parse sumo data =============================================
    
    # Fetch Sumo data
    data = _execute_query(sumo_db, _SUMO_SQL_FILE, params=params)

    # Create tmp_sumo table
    query = '''CREATE TEMPORARY TABLE tmp_sumo (
                `date`                     DATE,
                `version`                  INT,
                `is_desktop`               BOOL,
                `num_unanswered_72`        INT,
                `num_posts`                INT
            );'''
    ua_db.execute_sql(query)

    # Insert Sumo data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_sumo', header, has_header = False)

    # =========== Parse ADI data ===============================================
    
    # Generate query
    # TODO(rrayborn): Need to investigate why part of the end date is missing.
    #                 Doesn't seem to affect the start_date...
    today_minus_three = (date.today() - timedelta(days=3)).strftime('%Y-%m-%d')
    adi_end_date = min(today_minus_three, end_date)
    with open(_ADIS_SQL_FILE, 'r') as adis_sql:
        query = adis_sql.read().replace('\n','  ') % (start_date, adi_end_date)

    # Generate/execute command line
    cmd = 'echo "%s" | isql -v metrics_dsn  -b -x0x09  >%s' # | tail -n+10'
    check_output(cmd % (query, adis_file), shell=True)

    # Create tmp table
    query = '''CREATE TEMPORARY TABLE tmp_adis (
                `date`                DATE,
                version               INT,
                is_desktop            BOOL,
                num_adis              INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'num_adis']

    ua_db.insert_csv_into_table(adis_file, 'tmp_adis', header, delimiter = '\t')
    
    # =========== Parse Analytics data =========================================

    # Get Google analytics data
    google_analytics.generate_inproduct( 
            db          = ua_db,
            device_type = product, 
            filename    = visits_file,
            start_date  = start_date,
            end_date    = end_date
        )

    # Create tmp table
    query = '''CREATE TEMPORARY TABLE tmp_sumo_visits (
                `date`      DATE, 
                version     INT,
                is_desktop  BOOL,
                visits      INT
            );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'visits']
    ua_db.insert_csv_into_table(visits_file, 'tmp_sumo_visits', header, delimiter = '\t')

    
    # =========== Parse Play data ==============================================

    query = '''CREATE TEMPORARY TABLE tmp_play AS 
        SELECT
            `date`, 
            version,
            AVG(rating) AS play_average,
            COUNT(*)    AS play_volume
        FROM google_play_reviews
        WHERE
                `date` >= :start_date
            AND `date` <= :end_date
        GROUP BY 1,2;
    '''
    ua_db.execute_sql(query, params)
    
    # =========== Run Stats query ==============================================
    query_files = []
    if product == 'both' or product == 'desktop':
        query_files.append(_DESKTOP_FILE_PATTERN)
    if product == 'both' or product == 'mobile':
        query_files.append(_MOBILE_FILE_PATTERN)

    for query_file in query_files:
        _execute_query(ua_db,query_file, params=params)
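A hedged usage sketch: when no database handles are passed in, `update` opens its own connections, so a nightly refresh could be invoked as below (the dates are placeholders in the expected `YYYY-MM-DD` format):

# Refresh desktop stats for a fixed two-week window; ua_db/input_db/sumo_db
# are left as None so update() builds its own connections. Assumes the
# metrics_dsn ODBC source and GA credentials used inside update() are
# available in this environment.
update(product='desktop',
       start_date='2015-06-01',
       end_date='2015-06-14')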
Example #3
def generate_inproduct(
            db          = None,
            auth_token  = None,
            versions    = None,
            device_type = 'both',
            filename    = None,
            start_date  = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
            end_date    = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
            ga_ids      = None,
            max_results = _MAX_RESULTS
        ):
    # Fetch the auth token lazily so the Google service is only contacted when
    # the function actually runs, not when the module is imported.
    if not auth_token:
        auth_token = google_services.google_service_connection().get_auth_token()
    if not versions:
        if not db:
            db = Db('sentiment')
        max_version = db.execute_sql('SELECT MAX(version) FROM release_info;').first()[0]
        versions = range(1,max_version)
    version_filters = []
    for version in versions:
        version_filters.append('ga:browserVersion=@%s.0' % version)

    if not filename:
        filename = device_type + '.tsv'

    device_types = []
    if device_type == 'both' or device_type == 'desktop':
        device_types.append(True)
    if device_type == 'both' or device_type == 'mobile':
        device_types.append(False)

    data = {}
    # SET UP REQUEST
    for is_desktop in device_types:
        # Choose the GA view ID for this device type unless one was passed in
        # explicitly; a per-iteration local keeps a 'both' run from reusing the
        # first device type's ID on the second pass.
        if ga_ids:
            request_ga_ids = ga_ids
        elif is_desktop:
            request_ga_ids = _DESKTOP_GA_ID
        else:
            request_ga_ids = _MOBILE_GA_ID

        device_categories = []
        if is_desktop:
            device_categories = ['ga:deviceCategory==desktop']
        else:
            device_categories = ['ga:deviceCategory==mobile',
                                 'ga:deviceCategory==tablet']

        dimensions = ['ga:date', 'ga:browserVersion']

        filters = [
                ','.join(version_filters),
                ','.join(device_categories),
                'ga:browser==Firefox',
                'ga:source=@inproduct'
            ]

        parameters = {
                'ids':          request_ga_ids,
                'access_token': auth_token,
                'max-results':  max_results,
                'filters':      ';'.join(filters),
                'dimensions':   ','.join(dimensions),
                'metrics':      'ga:sessions',
                'samplingLevel':'HIGHER_PRECISION'
            }

        # MAKE REQUESTS
        # This is done in a for loop since it's easy to hit Google's data limit
        # in a single multi-day request.
        day_datetime = datetime.strptime(start_date, '%Y-%m-%d')
        end_datetime = datetime.strptime(end_date, '%Y-%m-%d')
        while day_datetime <= end_datetime:
            day = day_datetime.strftime('%Y-%m-%d')
            parameters['start-date'] = day
            parameters['end-date']   = day
            _make_request(parameters, is_desktop, data)
            day_datetime += timedelta(days=1)
    
    with open(filename, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter='\t',
                                quotechar='\"', quoting=csv.QUOTE_MINIMAL)

        header = ['day','version','is_desktop','visits']
        spamwriter.writerow(header)
        for day in sorted(data.keys()):
            for version in sorted(data[day].keys()):
                for is_desktop in sorted(data[day][version].keys()):
                    spamwriter.writerow([day, version, is_desktop, 
                                        data[day][version][is_desktop]])
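An illustrative call, assuming Google Analytics credentials are already configured for `google_services`; the version list, date window, and output filename are arbitrary examples:

# Pull in-product visit sessions for a few desktop versions and write them to
# a tab-separated file.
generate_inproduct(device_type='desktop',
                   versions=[38, 39, 40],
                   filename='desktop_visits.tsv',
                   start_date='2015-06-01',
                   end_date='2015-06-07')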
from datetime import date, timedelta
from os import path
from subprocess import call

from lib.database.backend_db import Db

_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_OTHER_SQL_FILE = _PIPELINE_PATH + 'other.sql'
_COUNTS_SQL_FILE = _PIPELINE_PATH + 'counts.sql'
_DUMP_SQL_FILE = _PIPELINE_PATH + 'dump.sql'

#TODO(rrayborn): Make this a relative path
_OUTPUT_PATH = '/var/server/server/useradvocacy/data/static_json/'
_OUTPUT_JSON_NAME = 'hello.json'
_OUTPUT_CSV_NAME = 'hello.csv.gz'

#TODO(rrayborn): make this more elegant
_INPUT_DB = Db('input')


def update(start_date=date.today() - timedelta(days=6 * 7),
           end_date=date.today() - timedelta(days=1),
           last_run_date=date.today() - timedelta(days=1),
           output_path=_OUTPUT_PATH):
    '''
    Updates the Hello files.

    Args:
        start_date    (datetime): start date of data to pull, inclusive (default: 42 days ago)
        end_date      (datetime): end date of data to pull, inclusive   (default: 1 day ago)
        last_run_date (datetime): last date that the pipeline was run for (default: 1 day ago)
        output_path   (str): the location where our files should be output (default: _OUTPUT_PATH)
    '''
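The function body is not included in this excerpt; going only by the signature and docstring, a scheduled invocation might look like this sketch (the arguments simply restate the documented defaults):

# Hypothetical scheduler entry point for the Hello pipeline update.
if __name__ == '__main__':
    update(start_date=date.today() - timedelta(days=6 * 7),
           end_date=date.today() - timedelta(days=1),
           last_run_date=date.today() - timedelta(days=1),
           output_path=_OUTPUT_PATH)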
from collections import Counter
from os import environ, path
from subprocess import check_output

from lib.database.backend_db import Db

_OVERLAP = 2  # INCREASE THIS IF GOOGLE STARTS BACK POPULATING DATA

_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_DATA_PATH = _PIPELINE_PATH + 'data/'

_TMP_CSV = _DATA_PATH + '.google_play_tmp.csv'
_LATEST_CSV = _DATA_PATH + 'google_play_latest.csv'
_ALL_CSV = _DATA_PATH + 'google_play_all.csv'
_REVIEW_FILE_PATTERN = environ['GS_PLAY_BUCKET'] + \
        '/reviews/reviews_org.mozilla.*%s.csv'

#This should be handled better...
_SENTIMENT_DB = Db('sentiment')

#FLAGS = gflags.FLAGS
#
#gflags.DEFINE_integer('my_version', 0, 'Version number.')
#gflags.DEFINE_string('filename', None, 'Input file name', short_name='f')
#
#gflags.RegisterValidator('my_version',
#                        lambda value: value % 2 == 0,
#                        message='--my_version must be divisible by 2')
#gflags.MarkFlagAsRequired('filename')
#
#

