def snapshot_twitteraccounts(verbose=False): """Create today's SnapshotOfTwitterAccounts""" api = tweepy.API() for screen_name in TRACKED_ACCOUNTS: if verbose: print 'Scraping %s...' % screen_name u = api.get_user(screen_name) followers = u.followers_count following = u.friends_count tweets = u.statuses_count today = datetime.now().date() # How long since we scraped this account? latest = Session.query(SnapshotOfTwitterAccount)\ .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\ .order_by(SnapshotOfTwitterAccount.timestamp.desc())\ .first() if latest and latest.timestamp>=today: if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name continue # Create a snapshot sn = SnapshotOfTwitterAccount(\ timestamp=today,\ screen_name=screen_name,\ followers=followers,\ following=following,\ tweets=tweets) Session.add(sn) if verbose: print ' -> ',sn.toJson() Session.commit()
def snapshot_googleanalytics(verbose=False):
    googleanalytics_auth_json = os.environ.get('GOOGLEANALYTICS_AUTH')
    # Authenticate and construct service.
    service = initialize_service(googleanalytics_auth_json)
    date_string = (datetime.now()-timedelta(days=4)).date().isoformat()
    if verbose: print 'Snapshotting for '+date_string
    for x in iterate_profiles(service):
        try:
            profile_id = x['id']
            # How long since we scraped this account?
            latest = Session.query(SnapshotOfAnalytics)\
                    .filter(SnapshotOfAnalytics.website==x['name'])\
                    .order_by(SnapshotOfAnalytics.timestamp.desc())\
                    .first()
            day = (datetime.now()-timedelta(days=1)).date()
            if latest and latest.timestamp>=day:
                if verbose: print ' -> most recent snapshot for %s has already been processed.' % x['name']
                continue
            hits = get_hits(service, profile_id, day.isoformat())
            sn = SnapshotOfAnalytics(timestamp=day, website=x['name'], hits=hits)
            Session.add(sn)
            if verbose: print '%s: %d' % (x['name'], hits)
        except Exception, e:
            print e
    Session.commit()

def history__facebook():
    grain = _get_grain()
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfFacebook.timestamp)
    # Grouped query
    S = SnapshotOfFacebook
    q = Session.query()\
            .add_column( date_group )\
            .add_column( func.max(S.likes) )\
            .group_by(date_group)\
            .order_by(date_group.desc())
    response = _prepare(q.count())
    q = q.offset( response['offset'] )\
            .limit( response['per_page'] )
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'timestamp': x[0].isoformat(),
        'likes': x[1]
    }
    results = {
        'history': [ _dictize(x) for x in q ],
        'likes': Session.query(S).order_by(S.timestamp.desc()).first().likes
    }
    # Write response
    response['grain'] = grain
    response['data'] = results
    return response

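# (_get_grain and _prepare are used by every history__* endpoint but are not
# shown in this excerpt. The sketch below is reconstructed from how they are
# called; the defaults -- 'day' grain, 10 rows per page -- are assumptions.)
def _get_grain():
    # Grain must be a unit accepted by Postgres date_trunc().
    grain = request.args.get('grain', 'day')
    assert grain in ('day', 'week', 'month', 'quarter', 'year'), 'Invalid grain: %s' % grain
    return grain

def _prepare(total):
    # Build the pagination envelope shared by all endpoints: callers read
    # response['offset'] and response['per_page'] to window their query.
    per_page = int(request.args.get('per_page', 10))
    page = int(request.args.get('page', 0))
    return {
        'ok': True,
        'total': total,
        'page': page,
        'per_page': per_page,
        'offset': page * per_page,
    }
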
def get_activity(verbose=False):
    lists = util.list_mailman_lists(verbose)
    for l in lists:
        if verbose: print "Processing activity for %s..." % l["name"]
        latest = Session.query(ActivityInMailman)\
                .filter(ActivityInMailman.list_name == l["name"])\
                .order_by(ActivityInMailman.message_id.desc())\
                .first()
        # Walk through message history from the web front-end
        archive_url = l["link"].replace("mailman/listinfo", "pipermail")
        limit = 1000
        latest_id = latest.message_id if latest else -1
        for msg in _yield_messages(archive_url, latest_id, verbose=verbose):
            if verbose: print ' -> got msg #%d (%s: "%s")' % (msg["id"], msg["email"], msg["subject"])
            Session.add(ActivityInMailman(
                    list_name=l["name"],
                    message_id=msg["id"],
                    subject=msg["subject"],
                    author=msg["author"],
                    email=msg["email"],
                    link=msg["link"],
                    timestamp=msg["date"]))
            limit -= 1
            # if limit==0:
            #     if verbose: print ' -> Reached activity limit (100)'
            #     break
    Session.commit()

def snapshot_twitteraccounts(verbose=False):
    """Create today's SnapshotOfTwitterAccounts"""
    api = open_api()
    friends = api.GetFriends()
    for friend in friends:
        if verbose: print 'Scraping %s...' % friend.screen_name
        screen_name = friend.screen_name.lower()
        if screen_name=='theannotator':
            # legacy reasons
            screen_name = 'TheAnnotator'
        followers = friend.followers_count
        following = friend.friends_count
        tweets = friend.statuses_count
        today = datetime.now().date()
        # How long since we scraped this account?
        latest = Session.query(SnapshotOfTwitterAccount)\
                .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\
                .order_by(SnapshotOfTwitterAccount.timestamp.desc())\
                .first()
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name
            continue
        # Create a snapshot
        sn = SnapshotOfTwitterAccount(
                timestamp=today,
                screen_name=screen_name,
                followers=followers,
                following=following,
                tweets=tweets)
        Session.add(sn)
        if verbose: print ' -> ', sn.toJson()
    Session.commit()

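# (open_api() is not shown in this excerpt. A minimal sketch, assuming the
# python-twitter library -- whose Api objects expose GetFriends() and the
# screen_name / followers_count attributes used above -- with OAuth
# credentials in the environment; the variable names are assumptions.)
def open_api():
    import twitter
    return twitter.Api(
            consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
            consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
            access_token_key=os.environ['TWITTER_ACCESS_TOKEN_KEY'],
            access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
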
def snapshot_mailchimp(verbose=False):
    api_key = os.environ.get('MAILCHIMP_API_KEY')
    assert api_key, 'No MAILCHIMP_API_KEY defined in environment.'
    pm = PostMonkey(api_key, timeout=10)
    ping_string = pm.ping()
    expected = u'Everything\'s Chimpy!'
    assert ping_string==expected, 'Bad handshake, got "%s", expected "%s"' % (ping_string, expected)
    if verbose: print 'handshake ok'
    lists = pm.lists()
    if 'data' not in lists:
        print 'Got bad lists object from server.'
        pprint(lists)
        raise ValueError('Bad lists object from server')
    # Snapshot creation code...
    today = datetime.now().date()
    for l in lists['data']:
        try:
            if verbose: print 'Scraping %s...' % l['name']
            latest = Session.query(model.SnapshotOfMailchimp)\
                    .filter(model.SnapshotOfMailchimp.name==l['name'])\
                    .order_by(model.SnapshotOfMailchimp.timestamp.desc())\
                    .first()
            if latest and latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            snapshot = model.SnapshotOfMailchimp(
                    name=l['name'],
                    members=l['stats']['member_count'],
                    timestamp=today)
            if verbose: print ' -> ', snapshot.toJson()
            Session.add(snapshot)
            Session.commit()
        except Exception, e:
            pprint({'list': l, 'exception': str(e)})

def history__github():
    grain = _get_grain()
    # Filtered list of github IDs
    repo = request.args.get('repo', None)
    repoFilter = None
    if repo is not None:
        repo = repo.split(',')
        repoFilter = SnapshotOfGithub.repo_name.in_(repo)
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfGithub.timestamp)
    # Query: Range of dates
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date, max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date, max_date) = datetime.now()+timedelta(days=1), datetime.now()
    # Grouped query
    S = SnapshotOfGithub
    q = Session.query()\
            .add_column( func.sum(S.watchers) )\
            .add_column( func.max(S.forks) )\
            .add_column( func.max(S.open_issues) )\
            .add_column( func.max(S.size) )\
            .add_column( date_group )\
            .add_column( S.repo_name )\
            .group_by(date_group)\
            .group_by(S.repo_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( repoFilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'watchers': x[0],
        'forks': x[1],
        'issues': x[2],
        'size': x[3],
        'timestamp': x[4].date().isoformat(),
    }
    for x in q:
        repo_name = x[5]
        results[repo_name] = results.get(repo_name, {'repo': repo_name, 'data': []})
        results[repo_name]['data'].append(_dictize(x))
    # Write response
    response['grain'] = grain
    response['data'] = results
    response['repos'] = repo
    response['min_date'] = min_date.date().isoformat()
    response['max_date'] = max_date.date().isoformat()
    return response

def history__mailman():
    grain = _get_grain()
    # Filtered list of mailman IDs
    lists = request.args.get('list')
    listFilter = None
    if lists is not None:
        lists = lists.split(',')
        listFilter = SnapshotOfMailman.list_name.in_(lists)
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfMailman.timestamp)
    # Query: Range of dates
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        subquery = q1.subquery()
        (min_date, max_date) = Session.query(func.min(subquery.columns.d), func.max(subquery.columns.d)).first()
    else:
        # Impossible date range
        (min_date, max_date) = datetime.now()+timedelta(days=1), datetime.now()
    # Grouped query
    S = SnapshotOfMailman
    q = Session.query()\
            .add_column( func.sum(S.posts_today) )\
            .add_column( func.max(S.subscribers) )\
            .add_column( date_group )\
            .add_column( S.list_name )\
            .group_by(date_group)\
            .group_by(S.list_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( listFilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'posts': x[0],
        'subscribers': x[1],
        'timestamp': x[2].isoformat(),
    }
    # Build output datastructure from rows
    for x in q:
        list_name = x[3]
        results[list_name] = results.get(list_name, {'list_name': list_name, 'data': []})
        results[list_name]['data'].append(_dictize(x))
    # Write response
    response['grain'] = grain
    response['data'] = results
    response['list'] = lists
    response['min_date'] = min_date.isoformat()
    response['max_date'] = max_date.isoformat()
    return response

def data__mailman():
    """Unpaginated -- there are fewer than 200 entries in the database"""
    response = {'ok': True}
    q = Session.query(SnapshotOfMailman.list_name).distinct().order_by(SnapshotOfMailman.list_name)
    response['data'] = [ x[0] for x in q ]
    response['total'] = q.count()
    return response

def data__analytics():
    """Unpaginated -- there are fewer than 200 entries in the database"""
    response = {'ok': True}
    q = Session.query(SnapshotOfAnalytics.website).distinct().order_by(SnapshotOfAnalytics.website)
    response['data'] = [ x[0] for x in q ]
    response['total'] = q.count()
    return response

def history__analytics():
    grain = _get_grain()
    # Filter by website name
    websites = request.args.get('website', None)
    websitefilter = None
    if websites is not None:
        websites = websites.split(',')
        websitefilter = SnapshotOfAnalytics.website.in_(websites)
    # Query: Range of dates
    date_group = func.date_trunc(grain, SnapshotOfAnalytics.timestamp)
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date, max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date, max_date) = datetime.now()+timedelta(days=1), datetime.now()
    # Grouped query
    S = SnapshotOfAnalytics
    q = Session.query()\
            .add_column( func.sum(S.hits) )\
            .add_column( date_group )\
            .add_column( S.website )\
            .group_by(date_group)\
            .group_by(S.website)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( websitefilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'hits': x[0],
        'timestamp': x[1].date().isoformat(),
    }
    for x in q:
        website = x[2]
        x = _dictize(x)
        results[website] = results.get(website, {'website': website, 'data': []})
        results[website]['data'].append(x)
    # Write response
    response['data'] = results
    response['grain'] = grain
    return response

def snapshot_repos(verbose=False):
    """Create today's SnapshotOfGithub for each tracked repo."""
    repo_list = _get_repo_list(verbose)
    today = datetime.now().date()
    for (repo_name, repo) in repo_list.items():
        if verbose: print 'Processing snapshots for %s...' % repo_name
        latest = Session.query(SnapshotOfGithub)\
                .filter(SnapshotOfGithub.repo_name==repo_name)\
                .order_by(SnapshotOfGithub.timestamp.desc())\
                .first()
        # Skip repos we have already snapshotted today
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshots have already been processed.'
            continue
        # Snapshot the repo as it stands today
        snapshot = SnapshotOfGithub(
                timestamp=today,
                repo_name=repo_name,
                open_issues=repo.open_issues,
                size=repo.size,
                watchers=repo.watchers,
                forks=repo.forks)
        if verbose: print ' -> ', snapshot.toJson()
        Session.add(snapshot)
    Session.commit()

def activity__mailman():
    select_lists = request.args.get('list', None)
    q = Session.query(ActivityInMailman)\
            .order_by(ActivityInMailman.timestamp.desc())
    if select_lists is not None:
        select_lists = select_lists.split(',')
        q = q.filter(func.lower(ActivityInMailman.list_name).in_(select_lists))
    response = _prepare(q.count())
    q = q.offset(response['offset'])\
            .limit(response['per_page'])
    response['data'] = [ x.toJson() for x in q ]
    return response

def snapshot_mailman(verbose=False):
    lists = util.list_mailman_lists(verbose)
    today = datetime.now().date()
    for l in lists:
        if verbose: print 'Processing snapshots for %s...' % l['name']
        latest = Session.query(SnapshotOfMailman)\
                .filter(SnapshotOfMailman.list_name==l['name'])\
                .order_by(SnapshotOfMailman.timestamp.desc())\
                .first()
        # By default, gather 180 days of snapshots
        since = today - timedelta(days=180)
        if latest:
            if latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            since = latest.timestamp + timedelta(days=1)
        # Download subscriber list
        roster_url = l['link'].replace('listinfo', 'roster')
        num_subscribers = len(_scrape_subscribers(roster_url, verbose=verbose))
        # Create a snapshot of each day, counting messages per day
        while since<today:
            posts_today = Session.query(ActivityInMailman)\
                    .filter(ActivityInMailman.list_name==l['name'])\
                    .filter(ActivityInMailman.timestamp.between(since, since+timedelta(days=1)))\
                    .count()
            sn = SnapshotOfMailman(
                    list_name=l['name'],
                    timestamp=since,
                    subscribers=num_subscribers,
                    posts_today=posts_today)
            Session.add(sn)
            if verbose: print ' -> ', sn.toJson()
            since += timedelta(days=1)
    Session.commit()

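# (_scrape_subscribers is not shown in this excerpt. A minimal sketch,
# assuming a standard Mailman 2 roster page where each subscriber appears as
# a link to their options page; the regex is an assumption.)
import re
import urllib2

def _scrape_subscribers(roster_url, verbose=False):
    html = urllib2.urlopen(roster_url).read()
    # Subscriber entries link to ".../options/<listname>/<obfuscated-address>"
    subscribers = re.findall(r'href="[^"]*/options/[^"]+"', html)
    if verbose: print ' -> %d subscribers at %s' % (len(subscribers), roster_url)
    return subscribers
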
def snapshot_facebook(verbose=False):
    api = facebook.GraphAPI()
    obj = api.get_object('/OKFNetwork')
    if 'likes' not in obj:
        print 'Got bad object from server.'
        pprint(obj)
        raise ValueError('Bad object from server')
    likes = obj['likes']
    if verbose: print 'Likes today: %d' % likes
    # Snapshot creation code...
    today = datetime.now().date()
    latest = Session.query(model.SnapshotOfFacebook)\
            .order_by(model.SnapshotOfFacebook.timestamp.desc())\
            .first()
    if latest and latest.timestamp>=today:
        if verbose: print ' -> most recent snapshots have already been processed.'
        return
    snapshot = model.SnapshotOfFacebook(likes=likes, timestamp=today)
    if verbose: print ' -> ', snapshot.toJson()
    Session.add(snapshot)
    Session.commit()

def history__twitter():
    grain = _get_grain()
    # Filter by account name
    accountnames = request.args.get('name', None)
    accountFilter = None
    if accountnames is not None:
        accountnames = accountnames.split(',')
        accountFilter = SnapshotOfTwitterAccount.screen_name.in_(accountnames)
    # Query: Range of dates
    date_group = func.date_trunc(grain, SnapshotOfTwitterAccount.timestamp)
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date, max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date, max_date) = datetime.now()+timedelta(days=1), datetime.now()
    # Grouped query
    S = SnapshotOfTwitterAccount
    q = Session.query()\
            .add_column( func.max(S.followers) )\
            .add_column( func.max(S.following) )\
            .add_column( func.max(S.tweets) )\
            .add_column( date_group )\
            .add_column( S.screen_name )\
            .group_by(date_group)\
            .group_by(S.screen_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( accountFilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'followers': x[0],
        'following': x[1],
        'tweets': x[2],
        'timestamp': x[3].date().isoformat(),
        'screen_name': x[4]
    }
    for x in q:
        x = _dictize(x)
        name = x['screen_name']
        if name not in results:
            # Include the account's latest absolute numbers alongside its history
            latest = Session.query(S).filter(S.screen_name==name).order_by(S.timestamp.desc()).first()
            results[name] = {
                'account': {
                    'screen_name': name,
                    'followers': latest.followers,
                    'following': latest.following,
                    'tweets': latest.tweets,
                },
                'data': []
            }
        results[name]['data'].append(x)
    # Write response
    response['data'] = results
    response['grain'] = grain
    return response

def initialize_service(googleanalytics_auth_json):
    # Create an httplib2.Http object to handle our HTTP requests.
    http = httplib2.Http()
    # Prepare credentials, and authorize HTTP object with them.
    assert googleanalytics_auth_json, 'No GOOGLEANALYTICS_AUTH set in environment. '\
            'This should be the sample.dat file created by authenticating a sample app with Google Analytics.\n\n'\
            'Read: https://developers.google.com/analytics/solutions/articles/hello-analytics-api'
    credentials = Credentials.new_from_json(googleanalytics_auth_json)
    if credentials is None or credentials.invalid:
        credentials = run(FLOW, storage)
    http = credentials.authorize(http)
    # Retrieve service.
    return build('analytics', 'v3', http=http)

def iterate_profiles(service):
    accounts = service.management().accounts().list().execute()
    # Walk accounts -> web properties -> profiles, yielding each profile dict
    # (the standard Analytics v3 management traversal).
    for account in accounts.get('items', []):
        webproperties = service.management().webproperties()\
                .list(accountId=account['id']).execute()
        for webproperty in webproperties.get('items', []):
            profiles = service.management().profiles()\
                    .list(accountId=account['id'], webPropertyId=webproperty['id']).execute()
            for profile in profiles.get('items', []):
                yield profile
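
# (get_hits is not shown in this excerpt. A minimal sketch using the
# Analytics v3 core reporting API; the assumption that 'hits' means daily
# visits, i.e. the ga:visits metric, is mine.)
def get_hits(service, profile_id, date_string):
    result = service.data().ga().get(
            ids='ga:'+profile_id,
            start_date=date_string,
            end_date=date_string,
            metrics='ga:visits').execute()
    # The API returns totals as strings
    return int(result['totalsForAllResults']['ga:visits'])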