コード例 #1
0
ファイル: metric_writer.py プロジェクト: lbracken/news_data
def create_metrics_for_article(article_id, preview=False):
    """ Write daily and monthly term metrics for one analyzed article.

        The analyzed article is looked up by 'article_id'.  For every
        term in its term histogram the daily and monthly metric
        updaters are called with that term's histogram value.  When
        'preview' is true the terms are only counted and nothing is
        passed to the updaters.

        The module-level counters 'metricized_articles' and
        'terms_processed' are incremented, and a status report is
        printed on every preview call or every 'updt_freq' articles.
        If no article is found, an error line is printed instead.

    """
    global metricized_articles, terms_processed

    article = read_analyzed_article_from_db(article_id)
    if not article:
        print("  ERROR: No document with id of '%s' in DB" % article_id)
        return

    # Design note on how metric data can be written:
    #   (1) make sure the documents are allocated, then upsert only
    #       the changed value
    #   (2) upsert a whole document each time, all zeros but one value
    #   (3) upsert daily term documents one by one and aggregate them
    #       into higher level data afterwards
    # The first option is the one in use (the original note calls it
    # "the former"): more small reads from the DB, but smaller writes.

    # Date parts derived from the article's publication date
    pub_date = article["published"]
    year, month, day = pub_date.year, pub_date.month, pub_date.day
    month_start = datetime(year, month, 1)
    month_length = date_util.get_days_in_month(year, pub_date.month)

    # Walk the histogram; every term is counted, but metrics are only
    # written when this is not a preview run.
    histogram = article["term_histogram"]
    for term in histogram:
        terms_processed += 1
        if preview:
            continue

        count = histogram[term]
        update_daily_metrics(term, year, month, day, month_start,
            month_length, count)
        update_monthly_metrics(term, year, month, count)

    # One more article done; report status if it is time to do so
    metricized_articles += 1
    if not (preview or metricized_articles % updt_freq == 0):
        return

    print("  * Articles Metricized: %d..." % metricized_articles)
    print("      Terms: %d    Daily Docs %d    Monthly Docs %d" %
            (terms_processed, docs_created_daily, docs_created_monthly))
    print("      Monthly:  Read: %s,  Create: %s,  Write: %s" %
            (mr_time, mc_time, mw_time))
    print("      Daily:    Read: %s,  Create: %s,  Write: %s" %
            (dr_time, dc_time, dw_time))
コード例 #2
0
ファイル: metrics_service.py プロジェクト: lbracken/news_data
def get_daily_term_data(term, time_start, time_end, granularity):
    """ Get 'daily' granularity metric data for the given term and
        time range

        Walks the months from 'time_start' up to 'time_end' and emits
        one value per day.  Months with no stored document are filled
        with zeros.  In the first month the days start at
        'time_start.day'; in the month of 'time_end' they stop at
        'time_end.day'.

        'granularity' is accepted for interface compatibility but is
        not used by this function.

        Returns a tuple (data, total, avg, max_val): the list of daily
        values, their sum, their average over the emitted days, and
        the largest single value.

    """
    data = []
    avg = 0
    total = 0
    max_val = 0

    # Query the DB for all documents containing the given term and
    # within the given date range.  Both bounds must live in ONE
    # "date" entry: repeating the key in a dict literal keeps only the
    # last one, which silently dropped the lower bound.
    #
    # Documents are matched per month below and carry a "daily" map
    # keyed by day number, so the lower bound is the start of the first
    # requested month; otherwise a mid-month 'time_start' would exclude
    # that month's document.
    # NOTE(review): assumes time_start is a datetime (not a date) and
    # that documents are dated within the month they describe - confirm.
    first_month_start = time_start.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
    db_query = {"term" : term,
        "date" : {"$gte" : first_month_start, "$lte" : time_end}}

    result_set = list(db_metric_data_daily.find(db_query).sort("date"))
    result_set_idx = 0

    # Iterate over each month in the requested time range
    curr_month = time_start
    while curr_month < time_end:

        # Typically the end day is the number of days in the month.
        # However, if this is the final month in the requested time
        # range, the day may be earlier based upon the end date.
        days_in_month = date_util.get_days_in_month(
                curr_month.year, curr_month.month)

        if date_util.is_same_month(curr_month, time_end):
            days_in_month = time_end.day

        days = range(curr_month.day, days_in_month + 1)

        # Peek at the next unused result, if any.  If it matches the
        # current month, then use its data to build the response.
        # Otherwise, we have a gap in the data which should be filled
        # with an empty result.
        result = None
        if result_set_idx < len(result_set):
            result = result_set[result_set_idx]

        if result and date_util.is_same_month(curr_month, result["date"]):
            # Consume the result.  The bounds check above keeps the
            # index safe once it moves past the end of the list.
            result_set_idx += 1
            daily = result["daily"]
            for day in days:
                val = daily[str(day)]
                data.append(val)
                total += val
                max_val = max(max_val, val)
        else:
            data.extend([0] * len(days))

        curr_month = date_util.get_next_month(curr_month)

    # Calculate the final average over the days actually emitted (the
    # first month may start mid-month, so whole-month lengths would
    # overcount).  The division is left as-is: with integer counts on
    # Python 2 it truncates, which callers may rely on.
    if data:
        avg = total / len(data)

    return data, total, avg, max_val