def collect_google_scores(terms, start, end):
    # Silence the noisy discovery-cache logger before building the client
    logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
    logger.info('Querying %d terms between %s and %s' % (len(terms), start, end))
    logger.debug(', '.join(t.term for t in terms))
    service = build(
        'trends',
        'v1beta',
        developerKey=os.environ['GOOGLE_API_KEY'],
        discoveryServiceUrl='https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest',
        cache_discovery=False)
    graph = service.getTimelinesForHealth(
        terms=[t.term for t in terms],
        geoRestriction_region='GB-ENG',
        time_startDate=start.strftime('%Y-%m-%d'),
        time_endDate=end.strftime('%Y-%m-%d'),
        timelineResolution='day')
    try:
        response = graph.execute()
    except HttpError as e:
        logger.exception(e)
        raise
    for line in response['lines']:
        term = next(t for t in terms if t.term == line['term'])
        for point in line['points']:
            day = datetime.strptime(point['date'], '%b %d %Y').date()
            gs = get_google_score(term, day)
            gs.value = float(point['value'])
            yield gs
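# NOTE: get_google_score() is defined elsewhere. A minimal sketch of what it
# is assumed to do: a get-or-create, mirroring the ModelScore lookup the CSV
# runner below uses. The GoogleScore model and its column names are
# assumptions for illustration, not confirmed by this module.
def get_google_score_sketch(term, day):
    try:
        return GoogleScore.query.filter_by(term_id=term.id, day=day).one()
    except NoResultFound:
        gs = GoogleScore()
        gs.term = term
        gs.day = day
        return gs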
def calculate_model_scores(model, start, end):
    logger.info('Calculating new ModelScores between %s and %s' % (start, end))
    days_apart = (end - start).days + 1
    engine_runner = buildCalculator(get_engine_conf())
    days = (start + timedelta(days=d) for d in xrange(days_apart))
    for day in days:
        s = calculate_score(model, day, engine_runner)
        if s:
            yield s
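# Hypothetical usage sketch: persist everything the generator yields in one
# transaction, the same way run() below consumes it. The dates are
# illustrative, and `date` is assumed to be imported from datetime.
def example_backfill(model):
    for ms in calculate_model_scores(model, date(2017, 1, 1), date(2017, 1, 14)):
        db.session.add(ms)
    db.session.commit()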
def run(model, start, end, **kwargs):
    # For each day in date range:
    #     For each day in day and previous week:
    #         Collect tweets (include hour's padding each side of each day)
    #         Run shrew over tweets
    #         For each region:
    #             Run region's ngramcounter over tweets
    #     For each region:
    #         Normalise scores over the week
    #     Send normalised scores to matlab
    logger.info(
        "Run %s model between %s and %s" % (
            model.name, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))
    missing_twitter_score = list(days_missing_twitter_score(model, start, end))
    if missing_twitter_score:
        s = min(missing_twitter_score)
        e = max(missing_twitter_score)
        logger.info('Calculating TwitterScores between %s and %s' % (
            s.strftime('%Y-%m-%d'), e.strftime('%Y-%m-%d')))
        for ts in calculate_twitter_scores(model, s, e):
            db.session.add(ts)
    else:
        logger.info('TwitterScores already calculated')
    missing_model_score = list(days_missing_model_score(model, start, end))
    if missing_model_score:
        s = min(missing_model_score)
        e = max(missing_model_score)
        for ms in calculate_model_scores(model, s, e):
            db.session.add(ms)
    else:
        logger.info('ModelScores already calculated')
    db.session.commit()
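# A sketch of what days_missing_twitter_score() is assumed to do: yield each
# day in [start, end] that has no TwitterScore for this model yet. The
# TwitterScore model and its columns are assumptions based on the queries
# used elsewhere in this codebase.
def days_missing_twitter_score_sketch(model, start, end):
    known = set(
        ts.day for ts in TwitterScore.query.filter_by(model_id=model.id)
        .filter(TwitterScore.day.between(start, end)))
    for d in xrange((end - start).days + 1):
        day = start + timedelta(days=d)
        if day not in known:
            yield day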
def run(model, start, end, csv_file=None, **kwargs):
    if csv_file is None:
        raise FluDetectorError('No CSV file provided')
    logger.info('Reading CSV file into %s' % str(model))
    csv_reader = csv.reader(csv_file)
    headers = next(csv_reader)
    day_index = find_matching_index(headers, ['Day', 'Date'], required=True)
    region_index = find_region_index(headers)
    logger.debug('Found columns for regions %s' % ', '.join(region_index.keys()))
    logger.info('Reading rows...')
    for row_index, row in enumerate(csv_reader):
        day = datetime.strptime(row[day_index], '%Y-%m-%d').date()
        if day < start or day > end:
            continue
        for region, col_index in region_index.iteritems():
            try:
                value = float(row[col_index])
            except ValueError:
                logger.debug('Skipping row %d column %d, not a float' % (
                    row_index + 1, col_index))
                continue
            try:
                ms = ModelScore.query.filter_by(
                    model_id=model.id, day=day, region=region).one()
            except NoResultFound:
                ms = ModelScore()
                ms.region = region
                ms.model = model
                ms.day = day
            ms.value = value
            db.session.add(ms)
    db.session.commit()
    logger.info('Done!')
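# A sketch of the find_region_index() helper assumed above: map each header
# cell that names a known region to its column index. The REGIONS collection
# and the exact-match rule are assumptions for illustration.
def find_region_index_sketch(headers):
    return dict(
        (header, index)
        for index, header in enumerate(headers)
        if header in REGIONS)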
def run(model, start, end, **kwargs): """ Run this model between these two dates. Running the model means: 1) Collecting Google Scores for the model's terms on these days 2) Using the Google Scores to calculate the Model Scores Tries to be clever about when it needs to collect Google Data or not: - Find dates between start and end that are missing a GoogleScore for at least one of the terms - Query for all terms over the smallest date range that covers all missing dates. - e.g. Same run twice won't hit Google the second time - e.g. Two runs that overlap will never query the days in common - e.g. Removing a term from a model won't cause extra queries - e.g. Adding a term to a model will re-query every term """ logger.info("Run %s model between %s and %s" % (model.name, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'))) needs_collecting = list(days_missing_google_score(model, start, end)) if needs_collecting: # Go back a day to make sure the API can return some data # When the dates are near to date.today() you don't get empty responses # you get 400s window_size = model.get_data()['average_window_size'] collect_start = min(needs_collecting) - timedelta(days=window_size) collect_end = max(needs_collecting) batched = list(batches(model, collect_start, collect_end)) # Assuming 1 second to make and process call with worst case rate limiting secs_per_batch = 5 + 3 + 3 ^ 2 + 3 ^ 3 + 3 ^ 4 + 3 ^ 5 td = timedelta(seconds=len(batched) * secs_per_batch) logger.info('Due to rate limiting, querying Google will take at most %s' % str(td)) for batch, s, e in batched: run_batch(batch, s, e) else: logger.info('GoogleScores already collected') msg_date = None msg_value = None needs_calculating = list(days_missing_model_score(model, start, end)) if needs_calculating: calculate_start = min(needs_calculating) calculate_end = max(needs_calculating) td = timedelta(seconds=(calculate_end - calculate_start).days * 8) # Assuming 8 seconds to process each day logger.info('To process all days in Matlab/Octave will take roughly %s' % str(td)) for ms in calculate_model_scores(model, calculate_start, calculate_end): db.session.add(ms) if os.environ['TWITTER_ENABLED'] == 'True': msg_date = ms.day msg_value = ms.value else: logger.info('ModelScores already calculated') db.session.commit() if msg_date is not None and msg_value is not None: send_score_to_message_queue(msg_date, msg_value) logger.info('Latest ModelScore value sent to message queue')