def snapshot_twitteraccounts(verbose=False):
    """Create today's SnapshotOfTwitterAccounts"""
    api = tweepy.API()
    for screen_name in TRACKED_ACCOUNTS:
        if verbose: print 'Scraping %s...' % screen_name
        u = api.get_user(screen_name)
        followers = u.followers_count
        following = u.friends_count
        tweets = u.statuses_count
        today = datetime.now().date()
        # How long since we scraped this account?
        latest = Session.query(SnapshotOfTwitterAccount)\
                .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\
                .order_by(SnapshotOfTwitterAccount.timestamp.desc())\
                .first()
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name
            continue
        # Create a snapshot
        sn = SnapshotOfTwitterAccount(
                timestamp=today,
                screen_name=screen_name,
                followers=followers,
                following=following,
                tweets=tweets)
        Session.add(sn)
        if verbose: print '  -> ',sn.toJson()
    Session.commit()
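Note: tweepy.API() is constructed above without credentials, which only worked against Twitter's old unauthenticated API. A minimal sketch of an authenticated construction, assuming hypothetical TWITTER_* environment variables for the OAuth keys:

import os
import tweepy

def open_tweepy_api():
    # Hypothetical env var names -- adjust to your deployment.
    auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'],
                               os.environ['TWITTER_CONSUMER_SECRET'])
    auth.set_access_token(os.environ['TWITTER_ACCESS_TOKEN'],
                          os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
    return tweepy.API(auth)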
def snapshot_googleanalytics(verbose=False):
    googleanalytics_auth_json=os.environ.get('GOOGLEANALYTICS_AUTH')
    # Authenticate and construct service.
    service = initialize_service(googleanalytics_auth_json)
    date_string = (datetime.now()-timedelta(days=4)).date().isoformat()
    if verbose:
        print 'Snapshotting for '+date_string
    for x in iterate_profiles(service):
        try:
            profile_id = x['id']
            # How long since we scraped this account?
            latest = Session.query(SnapshotOfAnalytics)\
                    .filter(SnapshotOfAnalytics.website==x['name'])\
                    .order_by(SnapshotOfAnalytics.timestamp.desc())\
                    .first()
            day = (datetime.now()-timedelta(days=1)).date()
            if latest and latest.timestamp>=day:
                if verbose: print ' -> most recent snapshot for %s has already been processed.' % x['name']
                continue
            hits = get_hits(service, profile_id, day.isoformat())
            sn = SnapshotOfAnalytics(timestamp=day,website=x['name'],hits=hits)
            Session.add(sn)
            if verbose:
                print '%s: %d' % (x['name'], hits)
        except Exception, e:
            print e
    Session.commit()
Example #3
def history__facebook():
    grain = _get_grain()
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfFacebook.timestamp)
    # Grouped query
    S = SnapshotOfFacebook
    q = Session.query()\
            .add_column( date_group )\
            .add_column( func.max(S.likes) )\
            .group_by(date_group)\
            .order_by(date_group.desc())
    response = _prepare(q.count())
    q = q.offset( response['offset'] )\
          .limit( response['per_page'] )
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'timestamp':x[0].isoformat(),
        'likes':x[1]
    }
    results = {
            'history': [ _dictize(x) for x in q ],
            'likes' : Session.query(S).order_by(S.timestamp.desc()).first().likes
            }
    # Write response
    response['grain'] = grain
    response['data'] = results
    return response
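The history endpoints in these examples lean on two helpers that are never shown: _get_grain(), which whitelists the grain request argument before it reaches date_trunc, and _prepare(count), which derives pagination values. A minimal sketch of what they might look like, assuming a Flask request object and a default page size of 10:

from flask import request

VALID_GRAINS = ('day', 'week', 'month', 'quarter', 'year')

def _get_grain():
    # Whitelist the grain so it is safe to hand to date_trunc.
    grain = request.args.get('grain', 'day')
    return grain if grain in VALID_GRAINS else 'day'

def _prepare(total):
    # Derive offset/limit from the ?page=&per_page= query arguments.
    page = max(int(request.args.get('page', 1)), 1)
    per_page = max(int(request.args.get('per_page', 10)), 1)
    return {
        'ok': True,
        'total': total,
        'page': page,
        'per_page': per_page,
        'offset': (page - 1) * per_page,
    }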
def get_activity(verbose=False):
    lists = util.list_mailman_lists(verbose)
    for l in lists:
        if verbose:
            print "Processing activity for %s..." % l["name"]
        latest = (
            Session.query(ActivityInMailman)
            .filter(ActivityInMailman.list_name == l["name"])
            .order_by(ActivityInMailman.message_id.desc())
            .first()
        )
        # Walk through message history from the web front-end
        archive_url = l["link"].replace("mailman/listinfo", "pipermail")
        limit = 1000
        latest_id = latest.message_id if latest else -1
        for msg in _yield_messages(archive_url, latest_id, verbose=verbose):
            if verbose:
                print '  -> got msg #%d (%s: "%s")' % (msg["id"], msg["email"], msg["subject"])
            Session.add(
                ActivityInMailman(
                    list_name=l["name"],
                    message_id=msg["id"],
                    subject=msg["subject"],
                    author=msg["author"],
                    email=msg["email"],
                    link=msg["link"],
                    timestamp=msg["date"],
                )
            )
            limit -= 1
            # if limit == 0:
            #     if verbose: print '  -> Reached activity limit (100)'
            #     break
        Session.commit()
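_yield_messages is not shown in this example; from its call site it must yield, oldest first, one dict per message newer than latest_id, carrying the keys consumed above. An illustrative stub of that contract (a real implementation would walk the pipermail month indexes under archive_url):

from datetime import datetime

def _yield_messages(archive_url, latest_id, verbose=False):
    # Illustrative stub only: shows the dict shape get_activity() expects.
    sample = [{
        'id': latest_id + 1,
        'email': 'alice@example.org',
        'author': 'Alice',
        'subject': 'Hello world',
        'link': archive_url + '/2013-January/000001.html',
        'date': datetime(2013, 1, 1, 12, 0),
    }]
    for msg in sample:
        if msg['id'] > latest_id:
            yield msg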
Example #5
def snapshot_twitteraccounts(verbose=False):
    """Create today's SnapshotOfTwitterAccounts"""
    api = open_api()
    friends = api.GetFriends()

    for friend in friends:
        if verbose: print 'Scraping %s...' % friend.screen_name
        screen_name = friend.screen_name.lower()
        if screen_name=='theannotator':
            # legacy reasons
            screen_name = 'TheAnnotator'
        followers = friend.followers_count
        following = friend.friends_count
        tweets = friend.statuses_count
        today = datetime.now().date()
        # How long since we scraped this account?
        latest = Session.query(SnapshotOfTwitterAccount)\
                .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\
                .order_by(SnapshotOfTwitterAccount.timestamp.desc())\
                .first()
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name
            continue
        # Create a snapshot
        sn = SnapshotOfTwitterAccount(
                timestamp=today,
                screen_name=screen_name,
                followers=followers,
                following=following,
                tweets=tweets)
        Session.add(sn)
        if verbose: print '  -> ',sn.toJson()
    Session.commit()
Example #6
def snapshot_mailchimp(verbose=False):
    api_key = os.environ.get('MAILCHIMP_API_KEY')
    assert api_key, 'No MAILCHIMP_API_KEY defined in environment.'
    pm = PostMonkey(api_key, timeout=10)
    ping_string = pm.ping()
    expected = u'Everything\'s Chimpy!'
    assert ping_string==expected, 'Bad handshake, got "%s", expected "%s"' % (ping_string,expected)
    if verbose:
        print 'handshake ok'
    lists = pm.lists()
    if 'data' not in lists:
        print 'Got bad lists object from server.'
        pprint(lists)
        raise ValueError('Bad lists object from server')
    # Snapshot creation code...
    today = datetime.now().date()
    for l in lists['data']:
        try:
            if verbose: print 'Scraping %s...' % l['name']
            latest = Session.query(model.SnapshotOfMailchimp)\
                    .filter(model.SnapshotOfMailchimp.name==l['name'])\
                    .order_by(model.SnapshotOfMailchimp.timestamp.desc())\
                    .first()
            if latest and latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            snapshot = model.SnapshotOfMailchimp(
                    name=l['name'],
                    members=l['stats']['member_count'],
                    timestamp=today)
            if verbose: print '  -> ',snapshot.toJson()
            Session.add(snapshot)
            Session.commit()
        except Exception, e:
            pprint({'list':l,'exception':str(e)})
Example #7
def history__github():
    grain = _get_grain()
    # Filtered list of github IDs
    repo = request.args.get('repo', None)
    repoFilter = None
    if repo is not None:
        repo = repo.split(',')
        repoFilter = SnapshotOfGithub.repo_name.in_(repo)
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfGithub.timestamp)
    # Query: Range of dates
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date,max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date,max_date) = datetime.now()+timedelta(days=1),datetime.now()
    # Grouped query
    S = SnapshotOfGithub
    q = Session.query()\
            .add_column( func.sum(S.watchers) )\
            .add_column( func.max(S.forks) )\
            .add_column( func.max(S.open_issues) )\
            .add_column( func.max(S.size) )\
            .add_column( date_group )\
            .add_column( S.repo_name )\
            .group_by(date_group)\
            .group_by(S.repo_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( repoFilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'watchers':x[0],
        'forks':x[1],
        'issues':x[2],
        'size':x[3],
        'timestamp':x[4].date().isoformat(),
    }
    for x in q:
        repo_name = x[5] 
        results[repo_name] = results.get(repo_name, { 'repo':repo_name, 'data':[] })
        results[repo_name]['data'].append( _dictize(x) )
    response['grain'] = grain
    response['data'] = results
    response['repos'] = repo
    response['min_date'] = min_date.date().isoformat()
    response['max_date'] = max_date.date().isoformat()
    return response
Example #8
def history__mailman():
    grain = _get_grain()
    # Filtered list of mailman IDs
    lists = request.args.get('list')
    listFilter = None
    if lists is not None:
        lists = lists.split(',') 
        listFilter = SnapshotOfMailman.list_name.in_(lists)
    # Date filter
    date_group = func.date_trunc(grain, SnapshotOfMailman.timestamp)
    # Query: Range of dates
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        subquery = q1.subquery()
        (min_date,max_date) = Session.query(func.min(subquery.columns.d), func.max(subquery.columns.d)).first()
    else:
        # Impossible date range
        (min_date,max_date) = datetime.now()+timedelta(days=1),datetime.now()
    # Grouped query
    S = SnapshotOfMailman
    q = Session.query()\
            .add_column( func.sum(S.posts_today) )\
            .add_column( func.max(S.subscribers) )\
            .add_column( date_group )\
            .add_column( S.list_name )\
            .group_by(date_group)\
            .group_by(S.list_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( listFilter )
    results = {}
    # Inner function transforms SELECT tuple into recognizable format
    _dictize = lambda x: {
        'posts':x[0],
        'subscribers':x[1],
        'timestamp':x[2].isoformat(),
    }
    # Build output datastructure from rows
    for x in q:
        list_name = x[3]
        results[list_name] = results.get(list_name, { 'list_name':list_name, 'data':[] })
        results[list_name]['data'].append( _dictize(x) )
    # Write response
    response['grain'] = grain
    response['data'] = results
    response['list'] = lists
    response['min_date'] = min_date.isoformat()
    response['max_date'] = max_date.isoformat()
    return response
Example #9
def data__mailman():
    """Unpaginated -- there are less than 200 entries in the database"""
    response = {'ok' : True}
    q = Session.query(SnapshotOfMailman.list_name).distinct().order_by(SnapshotOfMailman.list_name)
    response['data'] = [ x[0] for x in q ]
    response['total'] = q.count()
    return response
Example #10
def data__analytics():
    """Unpaginated -- there are less than 200 entries in the database"""
    response = {'ok' : True}
    q = Session.query(SnapshotOfAnalytics.website).distinct().order_by(SnapshotOfAnalytics.website)
    response['data'] = [ x[0] for x in q ]
    response['total'] = q.count()
    return response
Example #11
def history__analytics():
    grain = _get_grain()
    websites = request.args.get('website',None)
    # Filter by account name
    websitefilter = None
    if websites is not None:
        websites = websites.split(',')
        websitefilter = SnapshotOfAnalytics.website.in_(websites)
    # Query: Range of dates
    date_group = func.date_trunc(grain, SnapshotOfAnalytics.timestamp)
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date,max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date,max_date) = datetime.now()+timedelta(days=1),datetime.now()
    # Grouped query
    S = SnapshotOfAnalytics
    q = Session.query()\
            .add_column( func.sum(S.hits) )\
            .add_column( date_group )\
            .add_column( S.website )\
            .group_by(date_group)\
            .group_by(S.website)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( websitefilter )
    results = {}
    _dictize = lambda x: {
        'hits':x[0],
        'timestamp':x[1].date().isoformat(),
    }
    for x in q:
        website = x[2]
        x = _dictize(x)
        results[website] = results.get(website, { 'website':website,'data':[] })
        results[website]['data'].append(x)
    response['data'] = results
    response['grain'] = grain
    return response
Example #12
def snapshot_repos(verbose=False):
    """Create SnapshotOfRepo objects in the database for 
       every day since the last time this was run."""
    repo_list = _get_repo_list(verbose)
    today = datetime.now().date()
    for (repo_name,repo) in repo_list.items():
        if verbose: print 'Processing snapshots for %s...' % repo_name
        latest = Session.query(SnapshotOfGithub)\
                .filter(SnapshotOfGithub.repo_name==repo_name)\
                .order_by(SnapshotOfGithub.timestamp.desc())\
                .first()
        # Skip if we already have today's snapshot
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshots have already been processed.'
            continue
        # Snapshot date for the last day (or more)
        snapshot = SnapshotOfGithub(
                timestamp=today,
                repo_name=repo_name,
                open_issues=repo.open_issues,
                size=repo.size,
                watchers=repo.watchers,
                forks=repo.forks)
        if verbose: print '  -> ',snapshot.toJson()
        Session.add(snapshot)
        Session.commit()
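_get_repo_list is assumed to return a mapping of repository name to an object exposing open_issues, size, watchers and forks. A rough sketch against the public GitHub REST API, assuming a hypothetical GITHUB_ORG environment variable and ignoring pagination:

import os
from collections import namedtuple
import requests

Repo = namedtuple('Repo', 'open_issues size watchers forks')

def _get_repo_list(verbose=False):
    org = os.environ.get('GITHUB_ORG', 'okfn')  # hypothetical default
    url = 'https://api.github.com/orgs/%s/repos' % org
    repos = {}
    for r in requests.get(url).json():
        if verbose: print 'Found repo %s' % r['name']
        repos[r['name']] = Repo(open_issues=r['open_issues_count'],
                                size=r['size'],
                                watchers=r['watchers_count'],
                                forks=r['forks_count'])
    return repos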
Example #13
def activity__mailman():
    select_lists = request.args.get('list',None)
    q = Session.query(ActivityInMailman)\
            .order_by(ActivityInMailman.timestamp.desc())
    if select_lists is not None:
        select_lists = select_lists.split(',') 
        q = q.filter(func.lower(ActivityInMailman.list_name).in_(select_lists))
    response = _prepare( q.count() )
    q = q.offset(response['offset'])\
            .limit(response['per_page'])
    response['data'] = [ x.toJson() for x in q ]
    return response
Example #14
def snapshot_mailman(verbose=False):
    lists = util.list_mailman_lists(verbose)
    today = datetime.now().date()
    for l in lists:
        if verbose: print 'Processing snapshots for %s...' % l['name']
        latest = Session.query(SnapshotOfMailman)\
                .filter(SnapshotOfMailman.list_name==l['name'])\
                .order_by(SnapshotOfMailman.timestamp.desc())\
                .first()
        # By default, backfill up to 180 days of snapshots
        since = today - timedelta(days=180)
        if latest:
            if latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            since = latest.timestamp + timedelta(days=1)
        # Download subscriber list
        roster_url = l['link'].replace('listinfo','roster')
        num_subscribers = len(_scrape_subscribers(roster_url, verbose=verbose))
        # Create a snapshot of each day
        while since<today:
            posts_today = Session.query(ActivityInMailman)\
                            .filter(ActivityInMailman.list_name==l['name'])\
                            .filter(ActivityInMailman.timestamp.between(since,since+timedelta(days=1)))\
                            .count()
            sn = SnapshotOfMailman(
                    list_name=l['name'],
                    timestamp=since,
                    subscribers=num_subscribers,
                    posts_today=posts_today)
            Session.add(sn)
            if verbose: print '  -> ',sn.toJson()
            since += timedelta(days=1)
        Session.commit()
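_scrape_subscribers is assumed to return the list of subscriber addresses scraped from the Mailman roster page (private rosters require a password, which this sketch does not handle). A rough sketch using requests and a regex over the obfuscated "user at domain" links Mailman renders:

import re
import requests

def _scrape_subscribers(roster_url, verbose=False):
    # Mailman rosters obfuscate addresses as "user at example.org".
    html = requests.get(roster_url).text
    addresses = re.findall(r'>([^<>]+ at [^<>]+)</a>', html)
    if verbose: print 'Found %d subscribers at %s' % (len(addresses), roster_url)
    return addresses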
Example #15
def snapshot_facebook(verbose=False):
    api = facebook.GraphAPI()
    obj = api.get_object('/OKFNetwork')
    if 'likes' not in obj:
        print 'Got bad object from server.'
        pprint(obj)
        raise ValueError('Bad object from server')
    likes = obj['likes']
    if verbose:
        print 'Likes today: %d' % likes
    # Snapshot creation code...
    today = datetime.now().date()
    latest = Session.query(model.SnapshotOfFacebook)\
            .order_by(model.SnapshotOfFacebook.timestamp.desc())\
            .first()
    if latest and latest.timestamp>=today:
        if verbose: print ' -> most recent snapshots have already been processed.'
        return
    snapshot = model.SnapshotOfFacebook(likes=likes, timestamp=today)
    if verbose: print '  -> ',snapshot.toJson()
    Session.add(snapshot)
    Session.commit()
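facebook.GraphAPI() without a token only worked on early versions of the Graph API; current versions require at least an app access token. A minimal sketch, assuming the facebook-sdk package and a hypothetical FACEBOOK_ACCESS_TOKEN environment variable:

import os
import facebook

def open_graph_api():
    token = os.environ.get('FACEBOOK_ACCESS_TOKEN')
    assert token, 'No FACEBOOK_ACCESS_TOKEN defined in environment.'
    return facebook.GraphAPI(access_token=token)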
Example #16
def history__twitter():
    grain = _get_grain()
    accountnames = request.args.get('name',None)
    # Filter by account name
    accountFilter = None
    if accountnames is not None:
        accountnames = accountnames.split(',')
        accountFilter = SnapshotOfTwitterAccount.screen_name.in_(accountnames)
    # Query: Range of dates
    date_group = func.date_trunc(grain, SnapshotOfTwitterAccount.timestamp)
    q1 = Session.query()\
            .add_column( func.distinct(date_group).label('d') )\
            .order_by(date_group.desc())
    response = _prepare(q1.count())
    q1 = q1.offset( response['offset'] )\
            .limit( response['per_page'] )
    if q1.count():
        date_column = q1.subquery().columns.d
        (min_date,max_date) = Session.query(func.min(date_column), func.max(date_column)).first()
    else:
        # Impossible date range
        (min_date,max_date) = datetime.now()+timedelta(days=1),datetime.now()
    # Grouped query
    S = SnapshotOfTwitterAccount
    q = Session.query()\
            .add_column( func.max(S.followers) )\
            .add_column( func.max(S.following) )\
            .add_column( func.max(S.tweets) )\
            .add_column( date_group )\
            .add_column( S.screen_name )\
            .group_by(date_group)\
            .group_by(S.screen_name)\
            .order_by(date_group.desc())\
            .filter( date_group>=min_date )\
            .filter( date_group<=max_date )\
            .filter( accountFilter )
    results = {}
    _dictize = lambda x: {
        'followers':x[0],
        'following':x[1],
        'tweets':x[2],
        'timestamp':x[3].date().isoformat(),
        'screen_name':x[4]
    }
    for x in q:
        x = _dictize(x)
        name = x['screen_name']
        if not (name in results):
            latest = Session.query(S).filter(S.screen_name==name).order_by(S.timestamp.desc()).first()
            results[name] = { 
                    'account':{
                        'screen_name':name,
                        'followers':latest.followers,
                        'following':latest.following,
                        'tweets':latest.tweets,
                    },
                    'data':[] 
            }
        results[name]['data'].append(x)
    response['data'] = results
    response['grain'] = grain
    return response

def initialize_service(googleanalytics_auth_json):
  # Create an httplib2.Http object to handle our HTTP requests.
  http = httplib2.Http()
  # Prepare credentials, and authorize HTTP object with them.
  assert googleanalytics_auth_json,'No GOOGLEANALYTICS_AUTH set in environment. This should be the sample.dat file created by authenticating a sample app with Google Analytics.\n\n  Read: https://developers.google.com/analytics/solutions/articles/hello-analytics-api'
  credentials = Credentials.new_from_json(googleanalytics_auth_json)
  if credentials is None or credentials.invalid:
    credentials = run(FLOW, storage)
  http = credentials.authorize(http)
  # Retrieve service.
  return build('analytics', 'v3', http=http)
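
initialize_service leans on the old oauth2client/apiclient stack; FLOW and storage are module-level objects from Google's hello-analytics sample that this excerpt does not define. The imports it assumes would look roughly like:

import httplib2
from apiclient.discovery import build
from oauth2client.client import Credentials
from oauth2client.tools import run  # provides run(FLOW, storage)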

def iterate_profiles(service):
  accounts = service.management().accounts().list().execute()