Example #1
import logging
import os
from datetime import datetime

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# `logger` and `get_google_score` are module-level helpers in the source project.
def collect_google_scores(terms, start, end):
    # Quieten the noisy googleapiclient discovery-cache warnings
    logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
    logger.info('Querying %d terms between %s and %s' % (len(terms), start, end))
    logger.debug(', '.join(t.term for t in terms))
    service = build(
        'trends',
        'v1beta',
        developerKey=os.environ["GOOGLE_API_KEY"],
        discoveryServiceUrl='https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest',
        cache_discovery=False
    )
    # Request daily, England-only (GB-ENG) timelines from the Trends health API
    graph = service.getTimelinesForHealth(
        terms=[t.term for t in terms],
        geoRestriction_region='GB-ENG',
        time_startDate=start.strftime('%Y-%m-%d'),
        time_endDate=end.strftime("%Y-%m-%d"),
        timelineResolution='day')
    try:
        response = graph.execute()
    except HttpError as e:
        logger.exception(e)
        raise  # bare raise preserves the original traceback
    # Match each response line back to its Term and yield an updated GoogleScore
    for line in response['lines']:
        term = next(t for t in terms if t.term == line['term'])
        for point in line['points']:
            day = datetime.strptime(point['date'], "%b %d %Y").date()
            gs = get_google_score(term, day)
            gs.value = float(point['value'])
            yield gs
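
A minimal sketch of how this generator might be driven. The `Term` namedtuple and the dates are hypothetical stand-ins for the project's own term objects, and a valid GOOGLE_API_KEY is assumed to be set in the environment:

from collections import namedtuple
from datetime import date

Term = namedtuple('Term', ['term'])  # hypothetical stand-in for the ORM model

terms = [Term('flu'), Term('fever'), Term('cough')]
for gs in collect_google_scores(terms, date(2017, 1, 1), date(2017, 1, 31)):
    print(gs.value)  # one GoogleScore per term per day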
Example #2
from datetime import timedelta

def calculate_model_scores(model, start, end):
    logger.info('Calculating new ModelScores between %s and %s' % (start, end))
    days_apart = (end - start).days + 1  # inclusive of both endpoints
    engine_runner = buildCalculator(get_engine_conf())
    days = (start + timedelta(days=d) for d in range(days_apart))
    for day in days:
        s = calculate_score(model, day, engine_runner)
        if s:
            yield s
Example #3
def run(model, start, end, **kwargs):
    # For each day in the date range:
    #   For each day in that day and the previous week:
    #     Collect tweets, with an hour's padding either side of each day
    #     Run shrew over the tweets
    #     For each region:
    #       Run the region's ngramcounter over the tweets
    #   For each region:
    #     Normalise the scores over the week
    #   Send the normalised scores to Matlab
    logger.info(
        "Run %s model between %s and %s" %
        (model.name, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))

    missing_twitter_score = list(days_missing_twitter_score(model, start, end))
    if missing_twitter_score:
        s = min(missing_twitter_score)
        e = max(missing_twitter_score)
        logger.info('Calculating TwitterScores between %s and %s' %
                    (s.strftime('%Y-%m-%d'), e.strftime('%Y-%m-%d')))
        for ts in calculate_twitter_scores(model, s, e):
            db.session.add(ts)
    else:
        logger.info('TwitterScores already calculated')

    missing_model_score = list(days_missing_model_score(model, start, end))
    if missing_model_score:
        s = min(missing_model_score)
        e = max(missing_model_score)
        for ms in calculate_model_scores(model, s, e):
            db.session.add(ms)
    else:
        logger.info('ModelScores already calculated')

    db.session.commit()
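
The days_missing_twitter_score and days_missing_model_score helpers aren't shown in these examples; run() only relies on them to yield date objects for the days that still need scores. A hypothetical stub satisfying that contract, e.g. for testing, might look like:

from datetime import timedelta

def days_missing_twitter_score(model, start, end):
    # Hypothetical stand-in: treat every day in the inclusive range as missing.
    # The real helper presumably checks the database for existing scores.
    for d in range((end - start).days + 1):
        yield start + timedelta(days=d)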
Example #4
import csv
from datetime import datetime

from sqlalchemy.orm.exc import NoResultFound

def run(model, start, end, csv_file=None, **kwargs):
    if csv_file is None:
        raise FluDetectorError('No CSV file provided')
    logger.info('Reading CSV file into %s' % str(model))
    csv_reader = csv.reader(csv_file)

    headers = next(csv_reader)  # the first row holds the column headers

    day_index = find_matching_index(headers, ['Day', 'Date'], required=True)
    region_index = find_region_index(headers)

    logger.debug('Found columns for regions %s' %
                 ', '.join(region_index.keys()))

    logger.info('Reading rows...')
    for row_index, row in enumerate(csv_reader):
        day = datetime.strptime(row[day_index], '%Y-%m-%d').date()

        if day < start or day > end:
            continue

        for region, col_index in region_index.items():
            try:
                value = float(row[col_index])
            except ValueError:
                logger.debug('Skipping row %d column %d, not a float' %
                             (row_index + 1, col_index))
                continue
            # Update the existing ModelScore if there is one, otherwise create it
            try:
                ms = ModelScore.query.filter_by(model_id=model.id,
                                                day=day,
                                                region=region).one()
            except NoResultFound:
                ms = ModelScore()
                ms.region = region
                ms.model = model
                ms.day = day
            ms.value = value
            db.session.add(ms)

    db.session.commit()
    logger.info('Done!')
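
The CSV layout this reader expects isn't shown alongside it, but it can be inferred from the header lookup and the per-region float columns. A hypothetical invocation, where model is assumed to be an existing model instance and the region column names are illustrative (they would need to match whatever find_region_index recognises):

import io
from datetime import date

csv_text = (
    'Day,London,South England\n'
    '2017-01-01,0.91,0.88\n'
    '2017-01-02,0.87,n/a\n'  # non-float cells are skipped, not fatal
)
run(model, date(2017, 1, 1), date(2017, 1, 2), csv_file=io.StringIO(csv_text))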
Example #5
import os
from datetime import timedelta

def run(model, start, end, **kwargs):
    """
    Run this model between these two dates. Running the model means:
        1) Collecting Google Scores for the model's terms on these days
        2) Using the Google Scores to calculate the Model Scores
    Tries to be clever about when it actually needs to collect Google data:
        - Find the dates between start and end that are missing a GoogleScore
          for at least one of the terms
        - Query for all terms over the smallest date range that covers all
          the missing dates
        - e.g. running the same job twice won't hit Google the second time
        - e.g. two overlapping runs will never query the days they share
        - e.g. removing a term from a model won't cause extra queries
        - e.g. adding a term to a model will re-query every term
    """
    logger.info("Run %s model between %s and %s" % (model.name, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))

    needs_collecting = list(days_missing_google_score(model, start, end))
    if needs_collecting:
        # Go back by the model's averaging window so the API has enough data
        # to return. When the dates are close to date.today() you don't get
        # empty responses, you get 400s.
        window_size = model.get_data()['average_window_size']
        collect_start = min(needs_collecting) - timedelta(days=window_size)
        collect_end = max(needs_collecting)

        batched = list(batches(model, collect_start, collect_end))
        # Worst case per batch: the call itself plus exponential backoff
        # between retries (3**1 .. 3**5 seconds)
        secs_per_batch = 5 + 3 + 3 ** 2 + 3 ** 3 + 3 ** 4 + 3 ** 5  # = 368
        td = timedelta(seconds=len(batched) * secs_per_batch)
        logger.info('Due to rate limiting, querying Google will take at most %s' % str(td))

        for batch, s, e in batched:
            run_batch(batch, s, e)
    else:
        logger.info('GoogleScores already collected')

    # Track the latest ModelScore so it can be pushed to the message queue
    msg_date = None
    msg_value = None
    needs_calculating = list(days_missing_model_score(model, start, end))
    if needs_calculating:
        calculate_start = min(needs_calculating)
        calculate_end = max(needs_calculating)

        td = timedelta(seconds=(calculate_end - calculate_start).days * 8)  # Assuming 8 seconds to process each day
        logger.info('To process all days in Matlab/Octave will take roughly %s' % str(td))

        for ms in calculate_model_scores(model, calculate_start, calculate_end):
            db.session.add(ms)
            if os.environ.get('TWITTER_ENABLED') == 'True':
                msg_date = ms.day
                msg_value = ms.value
    else:
        logger.info('ModelScores already calculated')

    db.session.commit()
    if msg_date is not None and msg_value is not None:
        send_score_to_message_queue(msg_date, msg_value)
        logger.info('Latest ModelScore value sent to message queue')
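
As a sanity check on the rate-limit estimate above: the worst-case seconds per batch is a small geometric sum, so each batch is projected to take just over six minutes at worst:

from datetime import timedelta

secs_per_batch = 5 + 3 + 3 ** 2 + 3 ** 3 + 3 ** 4 + 3 ** 5
print(secs_per_batch)                     # 368
print(timedelta(seconds=secs_per_batch))  # 0:06:08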