def main(): """Returns the number of fetches that resulted in an error.""" options = get_cmd_line_args() start_dt = date_util.from_date_iso(options.start_date) end_dt = date_util.from_date_iso(options.end_date) interval = int(options.interval) max_retries = int(options.max_retries) num_errors = 0 while start_dt < end_dt: next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt) print >> sys.stderr, '[%s] Fetching logs from [%s, %s)...' % ( datetime.datetime.now(), start_dt, next_dt) for tries in xrange(max_retries): try: if options.backend: response = fetch_appengine_logs(start_dt, next_dt, "backend", None) elif options.appengine_version: response = fetch_appengine_logs(start_dt, next_dt, None, options.appengine_version) else: response = fetch_appengine_logs(start_dt, next_dt, "frontend", None) except Exception, why: sleep_secs = 2**tries print >> sys.stderr, ('ERROR: %s.\n' 'Retrying in %s seconds...' % (why, sleep_secs)) time.sleep(sleep_secs) else: # The 'header' portion of the response goes into the # fetch-log. The rest goes into the actual logs. (headers, body) = _split_into_headers_and_body(response) sys.stderr.write(headers) # It's nice to give a brief summary of what the logs are like. print >> sys.stderr, ('%s request lines found' % _num_requests_in_logs(body)) if not body: print >> sys.stderr, 'WARNING: No logs found' print body, break else: # for/else: if we get here, we never succeeded in fetching num_errors += 1 print >> sys.stderr, ( 'SKIPPING logs from %s to %s: error fetching.' % (start_dt, next_dt)) start_dt = next_dt
def main(): """Returns the number of fetches that resulted in an error.""" options = get_cmd_line_args() start_dt = date_util.from_date_iso(options.start_date) end_dt = date_util.from_date_iso(options.end_date) interval = int(options.interval) max_retries = int(options.max_retries) num_errors = 0 while start_dt < end_dt: next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt) print >>sys.stderr, '[%s] Fetching logs from [%s, %s)...' % ( datetime.datetime.now(), start_dt, next_dt) for tries in xrange(max_retries): try: if options.backend: response = fetch_appengine_logs(start_dt, next_dt, "backend", None) elif options.appengine_version: response = fetch_appengine_logs(start_dt, next_dt, None, options.appengine_version) else: response = fetch_appengine_logs(start_dt, next_dt, "frontend", None) except Exception, why: sleep_secs = 2 ** tries print >>sys.stderr, ('ERROR: %s.\n' 'Retrying in %s seconds...' % (why, sleep_secs)) time.sleep(sleep_secs) else: # The 'header' portion of the response goes into the # fetch-log. The rest goes into the actual logs. (headers, body) = _split_into_headers_and_body(response) sys.stderr.write(headers) # It's nice to give a brief summary of what the logs are like. print >>sys.stderr, ('%s request lines found' % _num_requests_in_logs(body)) if not body: print >>sys.stderr, 'WARNING: No logs found' print body, break else: # for/else: if we get here, we never succeeded in fetching num_errors += 1 print >>sys.stderr, ('SKIPPING logs from %s to %s: error fetching.' % (start_dt, next_dt)) start_dt = next_dt
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)
    entity_list = download_entities(options.type,
                                    options.is_ndb,
                                    start_dt, end_dt,
                                    int(options.interval),
                                    int(options.max_logs),
                                    int(options.max_retries),
                                    options.key)
    with open(options.output_file, 'wb') as f:
        pickle.dump(entity_list, f)
    print >> sys.stderr, ("Downloaded and wrote %d entities. Exiting."
                          % len(entity_list))
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]

    # Don't touch tasks that were started recently.
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)

    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)

    for rec in results:
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
                                            rec["end_dt"], fetch_interval,
                                            config)
    g_logger.info("Done reprocessing!!")
def get_video_logs_by_user(day_str):
    """Get data from mongo"""
    # TODO(yunfang): parameterize the db parameters
    allusers = {}
    global vid2title
    vlog_collection = pymongo.Connection(port=12345)['kadb_vl']['VideoLog']
    iso_str = "%sT00:00:00Z" % day_str
    day = date_util.from_date_iso(iso_str)
    query = {"backup_timestamp": {"$gte": day,
                                  "$lt": day + datetime.timedelta(days=1)}}
    g_logger.info("Processing VideoLog for %s" % day_str)
    num_recs = 0
    for rec in vlog_collection.find(query):
        # Compress the rec to a 3-tuple to save some space
        simp_rec = (rec['youtube_id'],
                    rec['seconds_watched'],
                    rec['is_video_completed'])
        user = rec['user']
        vid2title[rec['youtube_id']] = rec['video_title']
        if user not in allusers:
            allusers[user] = []
        allusers[user].append(simp_rec)
        num_recs += 1
        if num_recs % 10000 == 0:
            g_logger.info("Processing %s db records" % num_recs)
    g_logger.info("DB data downloaded %s # records: %s" % (day_str, num_recs))
    return allusers
def main():
    options = get_cmd_line_args()
    config = load_unstripped_json(options.config)
    for key in DEFAULT_DOWNLOAD_SETTINGS.keys():
        if key not in config:
            config[key] = DEFAULT_DOWNLOAD_SETTINGS[key]

    if options.start_date and options.end_date:
        start_dt = date_util.from_date_iso(options.start_date)
        end_dt = date_util.from_date_iso(options.end_date)
    else:
        ts = time.time()
        end_ts = ts - (ts % int(options.proc_interval))
        start_ts = end_ts - int(options.proc_interval)
        start_dt = dt.datetime.fromtimestamp(start_ts)
        end_dt = dt.datetime.fromtimestamp(end_ts)

    if options.archive_dir:
        # Override the archive directory, if specified.
        config['archive_dir'] = options.archive_dir

    start_data_process(config, start_dt, end_dt)
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)
    interval = int(options.interval)
    entity_list = []
    while start_dt < end_dt:
        next_dt = min(start_dt + dt.timedelta(seconds=interval), end_dt)
        response = attempt_fetch_entities(options.type, start_dt, next_dt,
                                          int(options.max_logs),
                                          int(options.max_retries))
        entity_list += pickle.loads(response)
        start_dt = next_dt
    with open(options.output_file, 'wb') as f:
        pickle.dump(entity_list, f)
    print >> sys.stderr, ("Downloaded and wrote %d entities. Exiting."
                          % len(entity_list))
def populate(summary, day_str):
    """Populate the data to the report db"""
    # TODO(yunfang): parameterize this thing
    report_db = pymongo.Connection('184.73.72.110')['report']
    report_collection = report_db['daily_video_stats']
    g_logger.info("Populating data")
    iso_str = "%sT00:00:00Z" % day_str
    day = date_util.from_date_iso(iso_str)
    for ucat, vid2stats in summary.iteritems():
        for vid, stats in vid2stats.iteritems():
            if vid in vid2title:
                title = vid2title[vid]
            else:
                title = 'total'
            doc = {"day": day,
                   "date_str": day_str,
                   "ucat": ucat,
                   "vid": vid,
                   'vtitle': title}
            doc.update(stats)
            doc["_id"] = "%s-%s-%s" % (ucat, vid, day_str)
            report_collection.save(doc)
    g_logger.info("Done...")
def main(): """Returns the number of fetches that resulted in an error.""" options = get_cmd_line_args() start_dt = date_util.from_date_iso(options.start_date) end_dt = date_util.from_date_iso(options.end_date) interval = int(options.interval) max_retries = int(options.max_retries) appengine_versions = [options.appengine_version] if options.file_for_alternate_appengine_versions: appengine_versions.extend( _read_versions(options.file_for_alternate_appengine_versions)) if options.backend: try: # if the 'default' version is in the list, it isn't valid # for backends appengine_versions.remove(None) except ValueError: pass appengine_versions = ["mapreducebackend-" + v for v in appengine_versions] print >>sys.stderr, ('Looking at these appengine versions: %s' % [v or '(default)' for v in appengine_versions]) num_errors = 0 while start_dt < end_dt: next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt) print >>sys.stderr, '[%s] Fetching logs from [%s, %s)...' % ( datetime.datetime.now(), start_dt, next_dt) for tries in xrange(max_retries): try: response = fetch_appengine_logs(start_dt, next_dt, appengine_versions) except Exception, why: sleep_secs = 2 ** tries print >>sys.stderr, ('ERROR: %s.\n' 'Retrying in %s seconds...' % (why, sleep_secs)) time.sleep(sleep_secs) else: # The 'header' portion of the response goes into the # fetch-log. The rest goes into the actual logs. (headers, body) = _split_into_headers_and_body(response) sys.stderr.write(headers) # It's nice to give a brief summary of what the logs are like. print >>sys.stderr, ('%s request lines found' % _num_requests_in_logs(body)) if not body: print >>sys.stderr, 'WARNING: No logs found' print body, break else: # for/else: if we get here, we never succeeded in fetching num_errors += 1 print >>sys.stderr, ('SKIPPING logs from %s to %s: error fetching.' % (start_dt, next_dt)) start_dt = next_dt