def main(): """Verification of batching/anonymization script. Asserts: * Expected number of batch files exist for both public and private collections. * No extra batch files exist for both public and private collections. * All of the batch files are part on the current run. * Number of events is consistent between public and private, and matches up with upstream counts * No sensitive fields exists in public collections. """ run_id = utils.get_history_run_id_for('transform02') complaints_file = utils.get_complaints_for('transform02', 'w') complaints_file.write(settings.RUN_HEADER + '{}\n'.format(run_id)) batch_count = utils.get_batch_count() complaints = 0 print('Validating private data') complaints += verify_files('private', batch_count, run_id, complaints_file) print('Validating public data') complaints += verify_files('public', batch_count, run_id, complaints_file) if complaints > 0: print("This is {}.\n\nThat's {} {}!".format( ', '.join(['gross'] * complaints), complaints, 'whole "gross"' if complaints == 1 else '"grosses"' )) else: print("You've passed the final challenge! Huzzah, brave warrior!")
def main():
    input_filename = '/'.join([
        utils.get_dir_for('extract'),
        settings.EXTRACT_FILE,
    ])
    input_file = open(input_filename, 'r')
    run_id = utils.get_history_run_id_for('extract')
    complaints_file = utils.get_complaints_for('extract', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    for pageview_json in input_file.readlines():
        linenum += 1
        pageview = json.loads(pageview_json)
        visit = pageview['visit']
        action = pageview['action']

        # ip addresses are all scrubbed?
        if not re.search(r'0\.0$', visit['ip_addr']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: unscrubbed ip address! ({})\n'.format(
                    linenum, action['id'], visit['ip_addr']))

        if not action['page']['url']:
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url is missing!\n'.format(
                    linenum, action['id']))
        elif re.match(r'https?://', action['page']['url']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url includes domain! ({})\n'.format(
                    linenum, action['id'],
                    action['page']['url'].encode('utf-8')))

    input_file.close()

    if complaints > 0:
        print("You've got {} problems, but a ready-to-go migration ain't one!"
              .format(complaints))
    else:
        print("Looks good. How'd you manage that?")
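# The validator above expects every extracted IP address to end in '0.0',
# i.e. the last two octets zeroed. A minimal sketch of that scrubbing,
# assuming IPv4 dotted-quad input (the extract step itself is not shown here,
# and scrub_ip is a hypothetical helper name):
def scrub_ip(ip_addr):
    """Zero the host portion of an IPv4 address, e.g. '10.1.2.3' -> '10.1.0.0'."""
    octets = ip_addr.split('.')
    return '.'.join(octets[:2] + ['0', '0'])

assert scrub_ip('192.168.13.37') == '192.168.0.0'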
def main():
    input_filename = '/'.join([
        utils.get_dir_for('transform01'),
        settings.TRANSFORM01_FILE,
    ])
    run_id = utils.get_history_run_id_for('transform01')
    complaints_file = utils.get_complaints_for('transform01', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    complaints = 0
    with open(input_filename, 'r') as input_file:
        for i, pageview_json in enumerate(input_file):
            linenum = i + 1
            if not linenum % 100:
                print('Validating line {}'.format(linenum))
            pageview = json.loads(pageview_json)

            if pageview['page']['url'] is None:
                complaints += 1
                complaints_file.write('Line {}: empty url!\n'.format(linenum))

            # if pageview['page']['title'] is None:
            #     complaints += 1
            #     complaints_file.write('Line {}: empty page title!\n'.format(linenum))

            if pageview['time']['utc'] is None:
                complaints += 1
                complaints_file.write('Line {}: missing timestamp!\n'.format(linenum))

            if pageview['tech']['ip'] is not None:
                if pageview['anon']['continent'] is None or pageview['anon']['country'] is None:
                    complaints += 1
                    complaints_file.write(
                        'Line {}: Have IP addr ({}), but missing continent and/or country: ({} / {})\n'.format(
                            linenum,
                            pageview['tech']['ip'],
                            pageview['anon']['continent'] or 'None',
                            pageview['anon']['country'] or 'None',
                        )
                    )

    if complaints > 0:
        print("I got {} reasons to be mad at you.".format(complaints))
    else:
        print("You've done your homework, have a cookie!")
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen."""
    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your second-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None

    if dry_run:
        print("Doing dry-run upload to Elasticsearch. Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            ),
        }

    tally = {}
    seen = {}
    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:  # no resume log yet; start from the beginning
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:  # 0 => unbuffered (Python 2)
        for batch_id in range(1, batch_count + 1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("    ...seen, skipping.\n")
                    continue
                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write(' ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
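# load_batch_for() is referenced above but not shown. A minimal sketch of one
# plausible implementation: read a '<domain>-NNNN.data' batch file (one-line
# run header, then one JSON event per line) and send it either to a local
# Elasticsearch (dry run) or to Keen. The file layout, the '<domain>-pageviews'
# collection name, and the tally keys are assumptions; the es_client.index()
# and keen_client.add_events() calls are standard client APIs.
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    path = '{}/{}-{:04d}.data'.format(utils.get_dir_for('transform02'), domain, batch_id)
    collection = '{}-pageviews'.format(domain)  # assumed collection name

    events = []
    with open(path, 'r') as fp:
        fp.readline()  # toss run header
        for line in fp:
            events.append(json.loads(line))

    if dry_run:
        # index into local Elasticsearch instead of uploading to Keen
        for event in events:
            es_client.index(index=script_settings.ES_INDEX, doc_type=collection, body=event)
    else:
        # the keen client accepts a dict mapping collection names to event lists
        keen_client.add_events({collection: events})

    tally[collection] = tally.get(collection, 0) + len(events)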
def main(force=False):
    history_run_id = utils.get_history_run_id_for('extract')
    complaints_run_id = utils.get_complaints_run_id_for('extract')
    if history_run_id != complaints_run_id:
        print('You need to validate your exported data! Bailing...')
        sys.exit()

    extract_complaints = utils.get_complaints_for('extract', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print('You have unaddressed complaints!')
        if not force:
            print('  ...pass --force to ignore')
            sys.exit()
    extract_complaints.close()

    sqlite_db = sqlite3.connect(settings.SQLITE_PATH)
    sqlite_db.row_factory = sqlite3.Row
    sqlite_setup(sqlite_db)

    transform_dir = utils.get_dir_for('transform01')

    logger.info('Run ID: {}\n'.format(complaints_run_id))
    logger.info('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))

    tally = {'missing_user': 0, 'missing_node': 0}
    lastline = 0
    try:
        with open(utils.get_dir_for('transform01') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)  # read only the tail of the log
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    with open(utils.get_dir_for('transform01') + '/resume.log', 'a', 0) as resume_file:  # 0 => unbuffered (Python 2)
        with open(transform_dir + '/' + settings.TRANSFORM01_FILE, 'a') as output_file:
            with open(utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'r') as input_file:
                print('Lastline is: {}\n'.format(lastline))
                for i, pageview_json in enumerate(input_file):
                    linenum = i + 1
                    if linenum <= lastline:
                        if not linenum % 1000:
                            print('Skipping line {} of ***{}***'.format(linenum, lastline))
                        continue

                    if not linenum % 1000:
                        print('Transforming line {}'.format(linenum))

                    raw_pageview = json.loads(pageview_json)
                    visit = raw_pageview['visit']
                    action = raw_pageview['action']

                    # lookup location by ip address. piwik strips last 16 bits, so may not be
                    # completely accurate, but should be close enough.
                    ip_addr = visit['ip_addr']
                    location = get_location_for_ip_addr(ip_addr, sqlite_db)

                    # user has many visitor ids, visitor id has many session ids.
                    # in keen, visitor id will refresh 1/per year, session 1/per 30min.
                    visitor_id = get_or_create_visitor_id(visit['visitor_id'], sqlite_db)
                    session_id = get_or_create_session_id(visit['id'], sqlite_db)

                    user_id = visit['user_id']
                    user = get_or_create_user(user_id, sqlite_db)

                    node_id = action['node_id']
                    node = get_or_create_node(node_id, sqlite_db)

                    browser_version = [None, None]
                    if visit['ua']['browser']['version']:
                        browser_version = visit['ua']['browser']['version'].split('.')

                    os_version = [None, None]
                    if visit['ua']['os_version']:
                        os_version = visit['ua']['os_version'].split('.')
                        if len(os_version) == 1:
                            os_version.append(None)

                    os_family = parse_os_family(visit['ua']['os'])
                    if visit['ua']['os'] == 'WIN' and visit['ua']['os_version']:
                        os_family = os_family.replace('<Unknown Version>', visit['ua']['os_version'])

                    browser_info = {
                        'device': {
                            'family': visit['ua']['device'],
                        },
                        'os': {
                            'major': os_version[0],
                            'patch_minor': None,
                            'minor': os_version[1],
                            'family': os_family,
                            'patch': None,
                        },
                        'browser': {
                            'major': browser_version[0],
                            'minor': browser_version[1],
                            'family': parse_browser_family(visit['ua']['browser']['name']),
                            'patch': None,
                        },
                    }

                    # default to None so the key is always defined for locales without a region
                    browser_language = None
                    if '-' in visit['ua']['browser']['locale']:
                        browser_locale = visit['ua']['browser']['locale'].split('-')
                        browser_language = '-'.join([browser_locale[0], browser_locale[1].upper()])

                    node_tags = None if action['node_tags'] is None else action['node_tags'].split(',')

                    # piwik stores resolution as 1900x600 mostly, but sometimes as a float?
                    # For the sake of my sanity and yours, let's ignore floats.
                    screen_resolution = (None, None)
                    if 'x' in visit['ua']['screen']:
                        screen_resolution = visit['ua']['screen'].split('x')

                    # piwik fmt: '2016-05-11 20:30:00', keen fmt: '2016-06-30T17:12:50.070Z'
                    # piwik is always utc
                    utc_timestamp = datetime.strptime(action['timestamp'], '%Y-%m-%d %H:%M:%S')
                    utc_ts_formatted = utc_timestamp.isoformat() + '.000Z'

                    # naive, but correct
                    local_timedelta = timedelta(minutes=visit['tz_offset'])
                    local_timestamp = utc_timestamp + local_timedelta

                    pageview = {
                        'meta': {
                            'epoch': 0,  # migrated from piwik
                        },
                        'page': {
                            'title': action['page']['title'],
                            'url': (action['page']['url_prefix'] + action['page']['url']
                                    if action['page']['url'] is not None else None),
                            'info': {},  # (add-on)
                        },
                        'referrer': {
                            'url': action['referrer'] or None,
                            'info': {},  # (add-on)
                        },
                        'tech': {
                            'browser': {  # JS-side will be filled in by Keen.helpers.getBrowserProfile()
                                'cookies': True if visit['ua']['browser']['cookies'] else False,
                                'language': browser_language,
                                'screen': {
                                    'height': screen_resolution[1],
                                    'width': screen_resolution[0],
                                },
                            },
                            'ip': ip_addr,  # private
                            'ua': None,
                            'info': browser_info,
                        },
                        'time': {
                            'utc': timestamp_components(utc_timestamp),
                            'local': timestamp_components(local_timestamp),
                        },
                        'visitor': {
                            'id': visitor_id,
                            'session': session_id,
                            'returning': True if visit['visitor_returning'] else False,  # visit
                        },
                        'user': {
                            'id': user_id,
                            'entry_point': '' if user is None else user['entry_point'],  # empty string if no user
                            'locale': '' if user is None else user['locale'],  # empty string if no user
                            'timezone': '' if user is None else user['timezone'],  # empty string if no user
                            'institutions': None if user is None else user['institutions'],  # null if no user, else []
                        },
                        'node': {
                            'id': node_id,
                            'title': None if node is None else node['title'],
                            'type': None if node is None else node['category'],
                            'tags': node_tags,
                            'made_public_date': None if node is None else node['made_public_date'],
                        },
                        'geo': {},
                        'anon': {
                            'id': md5(session_id).hexdigest(),
                            'continent': None if location is None else location['continent'],
                            'country': None if location is None else location['country'],
                        },
                        'keen': {
                            'timestamp': utc_ts_formatted,
                            'addons': [
                                {
                                    'name': 'keen:referrer_parser',
                                    'input': {
                                        'referrer_url': 'referrer.url',
                                        'page_url': 'page.url',
                                    },
                                    'output': 'referrer.info',
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'page.url',
                                    },
                                    'output': 'page.info',
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'referrer.url',
                                    },
                                    'output': 'referrer.info',
                                },
                                {  # private
                                    'name': 'keen:ip_to_geo',
                                    'input': {
                                        'ip': 'tech.ip',
                                    },
                                    'output': 'geo',
                                },
                            ],
                        },
                    }

                    if node_id is None:
                        tally['missing_node'] += 1

                    if user_id is None:
                        tally['missing_user'] += 1

                    output_file.write(json.dumps(pageview) + '\n')
                    resume_file.write(str(linenum) + '\n')

    logger.info('Finished extraction at: {}Z\n'.format(datetime.utcnow()))
    logger.info('Final count was: {}\n'.format(linenum))
    logger.info('{} pageviews lacked a user id.\n'.format(tally['missing_user']))
    logger.info('{} pageviews lacked a node id.\n'.format(tally['missing_node']))

    sqlite_db.close()
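# A worked example of the timestamp handling above: piwik stores
# '2016-05-11 20:30:00' (always UTC), while Keen wants ISO-8601 with
# milliseconds and a 'Z' suffix, e.g. '2016-05-11T20:30:00.000Z'. Local time
# is the UTC time shifted by the visit's tz_offset, given in minutes.
from datetime import datetime, timedelta

utc_timestamp = datetime.strptime('2016-05-11 20:30:00', '%Y-%m-%d %H:%M:%S')
assert utc_timestamp.isoformat() + '.000Z' == '2016-05-11T20:30:00.000Z'

# a visitor at UTC-05:00 has tz_offset == -300
local_timestamp = utc_timestamp + timedelta(minutes=-300)
assert local_timestamp.isoformat() == '2016-05-11T15:30:00'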
def main(force=False):
    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning extraction at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)  # read only the tail of the log
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # 0 => unbuffered (Python 2)
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)

                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # only pageviews logged after the most recent make-public date are copied
                # to the public collection
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    # rebuild the list instead of removing while iterating,
                    # which would skip the element after each removal
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    # write_batch is assumed to write out and empty the list it is given
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
                    # record progress so an interrupted run can resume
                    # (mirrors the transform01 script above)
                    resume_file.write(str(linenum) + '\n')

    # flush the final partial batch, if any
    if linenum % settings.BATCH_SIZE != 0:
        batchnum += 1
        write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
        write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
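# write_batch() is referenced above but not shown. A minimal sketch consistent
# with the public_template/private_template filenames defined above and with
# the run-ID header that the verification script checks for; the exact header
# format (settings.RUN_HEADER + run id) is an assumption. It drains the list
# in place so the caller starts the next batch with an empty one.
def write_batch(batchnum, run_id, domain, pageviews, transform_dir):
    filename = '{}/{}-{:04d}.data'.format(transform_dir, domain, batchnum)
    with open(filename, 'w') as fp:
        fp.write(settings.RUN_HEADER + '{}\n'.format(run_id))  # assumed header format
        for pageview in pageviews:
            fp.write(json.dumps(pageview) + '\n')
    del pageviews[:]  # empty the caller's list in place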