Example #1
def main():
    """Returns the number of fetches that resulted in an error."""
    options = get_cmd_line_args()

    start_dt = date_util.from_date_iso(options.start_date)
    end_dt = date_util.from_date_iso(options.end_date)
    interval = int(options.interval)
    max_retries = int(options.max_retries)

    num_errors = 0
    while start_dt < end_dt:
        next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt)

        print >> sys.stderr, '[%s] Fetching logs from [%s, %s)...' % (
            datetime.datetime.now(), start_dt, next_dt)

        for tries in xrange(max_retries):
            try:
                if options.backend:
                    response = fetch_appengine_logs(start_dt, next_dt,
                                                    "backend", None)
                elif options.appengine_version:
                    response = fetch_appengine_logs(start_dt, next_dt, None,
                                                    options.appengine_version)
                else:
                    response = fetch_appengine_logs(start_dt, next_dt,
                                                    "frontend", None)
            except Exception, why:
                sleep_secs = 2**tries
                print >> sys.stderr, ('ERROR: %s.\n'
                                      'Retrying in %s seconds...' %
                                      (why, sleep_secs))
                time.sleep(sleep_secs)
            else:
                # The 'header' portion of the response goes into the
                # fetch-log.  The rest goes into the actual logs.
                (headers, body) = _split_into_headers_and_body(response)
                sys.stderr.write(headers)
                # It's nice to give a brief summary of what the logs are like.
                print >> sys.stderr, ('%s request lines found' %
                                      _num_requests_in_logs(body))
                if not body:
                    print >> sys.stderr, 'WARNING: No logs found'
                print body,
                break
        else:  # for/else: if we get here, we never succeeded in fetching
            num_errors += 1
            print >> sys.stderr, (
                'SKIPPING logs from %s to %s: error fetching.' %
                (start_dt, next_dt))

        start_dt = next_dt
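
The helpers _split_into_headers_and_body and _num_requests_in_logs are not
shown in these examples.  A minimal sketch of what they might look like,
assuming the fetched response separates its header block from the log body
with the first blank line and that request lines are the non-indented lines
(app-log continuations being indented); the real implementations may differ:

def _split_into_headers_and_body(response):
    # Assumption: everything up to the first blank line is fetch metadata,
    # everything after it is the raw request-log body.
    parts = response.split('\n\n', 1)
    if len(parts) == 1:
        return (response, '')
    return (parts[0] + '\n\n', parts[1])


def _num_requests_in_logs(body):
    # Assumption: each request is one non-indented line; indented lines are
    # continuation (app-log) lines belonging to the request above them.
    return sum(1 for line in body.splitlines()
               if line and not line[0].isspace())
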
Example #2
def main():
    """Returns the number of fetches that resulted in an error."""
    options = get_cmd_line_args()

    start_dt = date_util.from_date_iso(options.start_date)
    end_dt = date_util.from_date_iso(options.end_date)
    interval = int(options.interval)
    max_retries = int(options.max_retries)

    num_errors = 0
    while start_dt < end_dt:
        next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt)

        print >>sys.stderr, '[%s] Fetching logs from [%s, %s)...' % (
            datetime.datetime.now(), start_dt, next_dt)

        for tries in xrange(max_retries):
            try:
                if options.backend:
                    response = fetch_appengine_logs(start_dt, next_dt,
                        "backend", None)
                elif options.appengine_version:
                    response = fetch_appengine_logs(start_dt, next_dt,
                        None, options.appengine_version)
                else:
                    response = fetch_appengine_logs(start_dt, next_dt,
                        "frontend", None)
            except Exception, why:
                sleep_secs = 2 ** tries
                print >>sys.stderr, ('ERROR: %s.\n'
                                     'Retrying in %s seconds...'
                                     % (why, sleep_secs))
                time.sleep(sleep_secs)
            else:
                # The 'header' portion of the response goes into the
                # fetch-log.  The rest goes into the actual logs.
                (headers, body) = _split_into_headers_and_body(response)
                sys.stderr.write(headers)
                # It's nice to give a brief summary of what the logs are like.
                print >>sys.stderr, ('%s request lines found'
                                     % _num_requests_in_logs(body))
                if not body:
                    print >>sys.stderr, 'WARNING: No logs found'
                print body,
                break
        else:  # for/else: if we get here, we never succeeded in fetching
            num_errors += 1
            print >>sys.stderr, ('SKIPPING logs from %s to %s: error fetching.'
                                 % (start_dt, next_dt))

        start_dt = next_dt
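
date_util.from_date_iso is a project helper that is not defined in these
examples.  Judging from the "%sT00:00:00Z" strings built further below, it
seems to parse ISO-8601 timestamps into datetime objects; a rough,
hypothetical stand-in:

import datetime


def from_date_iso(iso_str):
    # Hypothetical: accepts strings like '2012-06-18T00:00:00Z' and returns
    # a naive datetime.  The real helper may accept more formats than this.
    return datetime.datetime.strptime(iso_str, '%Y-%m-%dT%H:%M:%SZ')
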
Example #3
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)

    entity_list = download_entities(options.type, options.is_ndb, start_dt,
                                    end_dt, int(options.interval),
                                    int(options.max_logs),
                                    int(options.max_retries), options.key)

    with open(options.output_file, 'wb') as f:
        pickle.dump(entity_list, f)

    print >> sys.stderr, ("Downloaded and wrote %d entities.  Exiting." %
                          len(entity_list))
Example #4
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]
    # Don't touch tasks that were recently started
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)
    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)
    for rec in results:
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
            rec["end_dt"], fetch_interval, config)
    g_logger.info("Done reprocessing!!")
Example #5
def get_video_logs_by_user(day_str):
    """Get data from mongo"""
    # TODO(yunfang): parameterize the db parameters
    allusers = {}
    global vid2title
    vlog_collection = pymongo.Connection(port=12345)['kadb_vl']['VideoLog']
    iso_str = "%sT00:00:00Z" % day_str
    day = date_util.from_date_iso(iso_str)
    query = {"backup_timestamp": 
                {"$gte": day,
                 "$lt": day + datetime.timedelta(days=1)}
            }
    g_logger.info("Processing VideoLog for %s" % day_str)
    num_recs = 0
    for rec in vlog_collection.find(query):
        # Compress the rec to a 3-tuple to save some space
        simp_rec = (rec['youtube_id'], rec['seconds_watched'],
                    rec['is_video_completed'])
        user = rec['user']
        vid2title[rec['youtube_id']] = rec['video_title']
        if user not in allusers:
            allusers[user] = []
        allusers[user].append(simp_rec)
        num_recs += 1
        if num_recs % 10000 == 0:
            g_logger.info("Processing %s db records" % num_recs)  
    g_logger.info("DB data downloaded %s # records: %s" % (day_str, num_recs))
    return allusers
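
A hypothetical use of get_video_logs_by_user: total seconds watched per user
for one day.  The assumption that seconds_watched is numeric is illustrative
only.

def total_seconds_watched_by_user(day_str):
    # Each record is the (youtube_id, seconds_watched, is_video_completed)
    # tuple built by get_video_logs_by_user above.
    allusers = get_video_logs_by_user(day_str)
    totals = {}
    for user, recs in allusers.iteritems():
        totals[user] = sum(rec[1] for rec in recs)
    return totals
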
Example #6
def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]
    # Don't touch tasks that were recently started
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)
    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)
    for rec in results:
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
                                            rec["end_dt"], fetch_interval,
                                            config)
    g_logger.info("Done reprocessing!!")
Example #7
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)

    entity_list = download_entities(options.type,
                                    options.is_ndb,
                                    start_dt, end_dt,
                                    int(options.interval),
                                    int(options.max_logs),
                                    int(options.max_retries),
                                    options.key)

    with open(options.output_file, 'wb') as f:
        pickle.dump(entity_list, f)

    print >> sys.stderr, ("Downloaded and wrote %d entities.  Exiting." %
                          len(entity_list))
Example #8
def main():
    options = get_cmd_line_args()
    config = load_unstripped_json(options.config)
    for key in DEFAULT_DOWNLOAD_SETTINGS.keys():
        if key not in config:
            config[key] = DEFAULT_DOWNLOAD_SETTINGS[key]
    if options.start_date and options.end_date:
        start_dt = date_util.from_date_iso(options.start_date)
        end_dt = date_util.from_date_iso(options.end_date)
    else:
        ts = time.time()
        end_ts = ts - (ts % int(options.proc_interval))
        start_ts = end_ts - int(options.proc_interval)
        start_dt = dt.datetime.fromtimestamp(start_ts)
        end_dt = dt.datetime.fromtimestamp(end_ts)
    if options.archive_dir:
        # Override the archive directory, if specified.
        config['archive_dir'] = options.archive_dir
    start_data_process(config, start_dt, end_dt)
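
When no explicit start/end dates are given, the code above snaps the window
back to the most recent whole proc_interval boundary.  A small worked example
of that arithmetic, using a fixed timestamp instead of time.time() so the
numbers are reproducible (and utcfromtimestamp to sidestep local-timezone
effects):

import datetime as dt

ts = 1340000000.0                        # 2012-06-18 06:13:20 UTC
proc_interval = 3600                     # one hour
end_ts = ts - (ts % proc_interval)       # 1339999200.0
start_ts = end_ts - proc_interval        # 1339995600.0
print dt.datetime.utcfromtimestamp(start_ts)   # 2012-06-18 05:00:00
print dt.datetime.utcfromtimestamp(end_ts)     # 2012-06-18 06:00:00
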
Example #9
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)
    interval = int(options.interval)
    entity_list = []

    while start_dt < end_dt:
        next_dt = min(start_dt + dt.timedelta(seconds=interval), end_dt)
        response = attempt_fetch_entities(
            options.type, start_dt, next_dt, int(options.max_logs), int(options.max_retries)
        )
        entity_list += pickle.loads(response)
        start_dt = next_dt

    with open(options.output_file, "wb") as f:
        pickle.dump(entity_list, f)

    print >>sys.stderr, ("Downloaded and wrote %d entities.  Exiting." % len(entity_list))
Example #10
def main():
    options = get_cmd_line_args()
    config = load_unstripped_json(options.config)
    for key in DEFAULT_DOWNLOAD_SETTINGS.keys():
        if key not in config:
            config[key] = DEFAULT_DOWNLOAD_SETTINGS[key]
    if options.start_date and options.end_date:
        start_dt = date_util.from_date_iso(options.start_date)
        end_dt = date_util.from_date_iso(options.end_date)
    else:
        ts = time.time()
        end_ts = ts - (ts % int(options.proc_interval))
        start_ts = end_ts - int(options.proc_interval)
        start_dt = dt.datetime.fromtimestamp(start_ts)
        end_dt = dt.datetime.fromtimestamp(end_ts)
    if options.archive_dir:
        # Override the archive directory, if specified.
        config['archive_dir'] = options.archive_dir
    start_data_process(config, start_dt, end_dt)
Example #11
def main():
    options = get_cmd_line_args()
    end_dt = date_util.from_date_iso(options.end_date)
    start_dt = date_util.from_date_iso(options.start_date)
    interval = int(options.interval)
    entity_list = []

    while start_dt < end_dt:
        next_dt = min(start_dt + dt.timedelta(seconds=interval), end_dt)
        response = attempt_fetch_entities(options.type, start_dt, next_dt,
                                          int(options.max_logs),
                                          int(options.max_retries))
        entity_list += pickle.loads(response)
        start_dt = next_dt

    with open(options.output_file, 'wb') as f:
        pickle.dump(entity_list, f)

    print >> sys.stderr, ("Downloaded and wrote %d entities.  Exiting." %
                          len(entity_list))
Example #12
def populate(summary, day_str):
    """Populate the data to the report db"""
    # TODO(yunfang): parameterize this thing
    report_db = pymongo.Connection('184.73.72.110')['report']
    report_collection = report_db['daily_video_stats']
    g_logger.info("Populating data")
    iso_str = "%sT00:00:00Z" % day_str
    day = date_util.from_date_iso(iso_str)
    for ucat, vid2stats in summary.iteritems():
        for vid, stats in vid2stats.iteritems():
            if vid in vid2title:
                title = vid2title[vid]
            else:
                title = 'total'
            doc = {"day": day, "date_str": day_str,
                   "ucat": ucat, "vid": vid, "vtitle": title}
            doc.update(stats)
            doc["_id"] = "%s-%s-%s" % (ucat, vid, day_str)
            report_collection.save(doc)
    g_logger.info("Done...")
Example #13
def main():
    """Returns the number of fetches that resulted in an error."""
    options = get_cmd_line_args()

    start_dt = date_util.from_date_iso(options.start_date)
    end_dt = date_util.from_date_iso(options.end_date)
    interval = int(options.interval)
    max_retries = int(options.max_retries)
    appengine_versions = [options.appengine_version]
    if options.file_for_alternate_appengine_versions:
        appengine_versions.extend(
            _read_versions(options.file_for_alternate_appengine_versions))

    if options.backend:
        try:
            # if the 'default' version is in the list, it isn't valid
            # for backends
            appengine_versions.remove(None)
        except ValueError:
            pass
        appengine_versions = ["mapreducebackend-" + v 
                              for v in appengine_versions]
        
    print >>sys.stderr, ('Looking at these appengine versions: %s'
                         % [v or '(default)' for v in appengine_versions])

    num_errors = 0
    while start_dt < end_dt:
        next_dt = min(start_dt + datetime.timedelta(seconds=interval), end_dt)

        print >>sys.stderr, '[%s] Fetching logs from [%s, %s)...' % (
            datetime.datetime.now(), start_dt, next_dt)

        for tries in xrange(max_retries):
            try:
                response = fetch_appengine_logs(start_dt, next_dt,
                                                appengine_versions)
            except Exception, why:
                sleep_secs = 2 ** tries
                print >>sys.stderr, ('ERROR: %s.\n'
                                     'Retrying in %s seconds...'
                                     % (why, sleep_secs))
                time.sleep(sleep_secs)
            else:
                # The 'header' portion of the response goes into the
                # fetch-log.  The rest goes into the actual logs.
                (headers, body) = _split_into_headers_and_body(response)
                sys.stderr.write(headers)
                # It's nice to give a brief summary of what the logs are like.
                print >>sys.stderr, ('%s request lines found'
                                     % _num_requests_in_logs(body))
                if not body:
                    print >>sys.stderr, 'WARNING: No logs found'
                print body,
                break
        else:  # for/else: if we get here, we never succeeded in fetching
            num_errors += 1
            print >>sys.stderr, ('SKIPPING logs from %s to %s: error fetching.'
                                 % (start_dt, next_dt))

        start_dt = next_dt
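
_read_versions is referenced above but not defined in these examples.  A
plausible sketch, assuming the file lists one App Engine version id per line
with blank lines and #-comments ignored; the real format may differ:

def _read_versions(filename):
    versions = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                versions.append(line)
    return versions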