Example #1
def snapshot_mailchimp(verbose=False):
    api_key = os.environ.get('MAILCHIMP_API_KEY')
    assert api_key, 'No MAILCHIMP_API_KEY defined in environment.'
    pm = PostMonkey(api_key, timeout=10)
    ping_string = pm.ping()
    expected = u'Everything\'s Chimpy!'
    assert ping_string==expected, 'Bad handshake, got "%s", expected "%s"' % (ping_string,expected)
    if verbose:
        print 'handshake ok'
    lists = pm.lists()
    if 'data' not in lists:
        print 'Got bad lists object from server.'
        pprint(lists)
        raise ValueError('Bad lists object from server')
    # Snapshot creation code...
    today = datetime.now().date()
    for l in lists['data']:
        try:
            if verbose: print 'Scraping %s...' % l['name']
            latest = Session.query(model.SnapshotOfMailchimp)\
                    .filter(model.SnapshotOfMailchimp.name==l['name'])\
                    .order_by(model.SnapshotOfMailchimp.timestamp.desc())\
                    .first()
            if latest and latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            snapshot = model.SnapshotOfMailchimp(
                    name=l['name'],
                    members=l['stats']['member_count'],
                    timestamp=today)
            if verbose: print '  -> ',snapshot.toJson()
            Session.add(snapshot)
            Session.commit()
        except Exception, e:
            pprint({'list':l,'exception':str(e)})
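
The model module referenced above isn't shown. A minimal sketch of what SnapshotOfMailchimp might look like as a SQLAlchemy declarative model, inferred from the fields used above (table name, column types and the toJson helper are assumptions, not the original code):

# Hypothetical sketch, not the original model definition.
from sqlalchemy import Column, Integer, Unicode, Date
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class SnapshotOfMailchimp(Base):
    __tablename__ = 'snapshot_of_mailchimp'   # table name is a guess
    id = Column(Integer, primary_key=True)
    name = Column(Unicode(255))   # list name, l['name']
    members = Column(Integer)     # l['stats']['member_count']
    timestamp = Column(Date)      # date the snapshot was taken

    def toJson(self):
        # Used by the verbose output above
        return {'name': self.name,
                'members': self.members,
                'timestamp': self.timestamp.isoformat()}
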
Example #2
def get_activity(verbose=False):
    lists = util.list_mailman_lists(verbose)
    for l in lists:
        if verbose:
            print "Processing activity for %s..." % l["name"]
        latest = (
            Session.query(ActivityInMailman)
            .filter(ActivityInMailman.list_name == l["name"])
            .order_by(ActivityInMailman.message_id.desc())
            .first()
        )
        # Walk through message history from the web front-end
        archive_url = l["link"].replace("mailman/listinfo", "pipermail")
        limit = 1000
        latest_id = latest.message_id if latest else -1
        for msg in _yield_messages(archive_url, latest_id, verbose=verbose):
            if verbose:
                print '  -> got msg #%d (%s: "%s")' % (msg["id"], msg["email"], msg["subject"])
            Session.add(
                ActivityInMailman(
                    list_name=l["name"],
                    message_id=msg["id"],
                    subject=msg["subject"],
                    author=msg["author"],
                    email=msg["email"],
                    link=msg["link"],
                    timestamp=msg["date"],
                )
            )
            limit -= 1
            # Safety cap, currently disabled:
            # if limit == 0:
            #     if verbose: print '  -> Reached activity limit (1000)'
            #     break
        Session.commit()
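
The l["link"].replace("mailman/listinfo", "pipermail") call above relies on Mailman's convention of serving its pipermail archives alongside the listinfo pages. A small illustration with a hypothetical list URL:

# Hypothetical list URL, for illustration only.
listinfo_url = 'http://lists.example.org/mailman/listinfo/discuss'
archive_url = listinfo_url.replace('mailman/listinfo', 'pipermail')
# archive_url is now 'http://lists.example.org/pipermail/discuss'
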
Example #3
def snapshot_twitteraccounts(verbose=False):
    """Create today's SnapshotOfTwitterAccounts"""
    api = open_api()
    friends = api.GetFriends()

    for friend in friends:
        if verbose: print 'Scraping %s...' % friend.screen_name
        screen_name = friend.screen_name.lower()
        if screen_name=='theannotator':
            # legacy reasons
            screen_name = 'TheAnnotator'
        followers = friend.followers_count
        following = friend.friends_count
        tweets = friend.statuses_count
        today = datetime.now().date()
        # How long since we scraped this account?
        latest = Session.query(SnapshotOfTwitterAccount)\
                .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\
                .order_by(SnapshotOfTwitterAccount.timestamp.desc())\
                .first()
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name
            continue
        # Create a snapshot
        sn = SnapshotOfTwitterAccount(
                timestamp=today,
                screen_name=screen_name,
                followers=followers,
                following=following,
                tweets=tweets)
        Session.add(sn)
        if verbose: print '  -> ',sn.toJson()
    Session.commit()
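
open_api() is defined elsewhere. A minimal sketch of what it might look like, assuming the python-twitter library (whose Api.GetFriends() returns user objects carrying the screen_name, followers_count, friends_count and statuses_count attributes read above) and OAuth credentials taken from the environment; the variable names are placeholders:

# Hypothetical sketch of open_api(), not the original helper.
import os
import twitter

def open_api():
    return twitter.Api(
        consumer_key=os.environ.get('TWITTER_CONSUMER_KEY'),
        consumer_secret=os.environ.get('TWITTER_CONSUMER_SECRET'),
        access_token_key=os.environ.get('TWITTER_ACCESS_TOKEN'),
        access_token_secret=os.environ.get('TWITTER_ACCESS_TOKEN_SECRET'))
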
Example #4
def snapshot_twitteraccounts(verbose=False):
    """Create today's SnapshotOfTwitterAccounts"""
    api = tweepy.API()
    for screen_name in TRACKED_ACCOUNTS:
        if verbose: print 'Scraping %s...' % screen_name
        u = api.get_user(screen_name)
        followers = u.followers_count
        following = u.friends_count
        tweets = u.statuses_count
        today = datetime.now().date()
        # How long since we scraped this account?
        latest = Session.query(SnapshotOfTwitterAccount)\
                .filter(SnapshotOfTwitterAccount.screen_name==screen_name)\
                .order_by(SnapshotOfTwitterAccount.timestamp.desc())\
                .first()
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshot for %s has already been processed.' % screen_name
            continue
        # Create a snapshot
        sn = SnapshotOfTwitterAccount(
                timestamp=today,
                screen_name=screen_name,
                followers=followers,
                following=following,
                tweets=tweets)
        Session.add(sn)
        if verbose: print '  -> ',sn.toJson()
    Session.commit()
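
This variant reads TRACKED_ACCOUNTS, a module-level list of screen names defined elsewhere, and calls tweepy.API() with no authentication handler, which worked against the unauthenticated v1 endpoints available when this was written (current Twitter APIs require OAuth). A placeholder definition, purely for illustration:

# Hypothetical placeholder; the real module defines its own account list.
TRACKED_ACCOUNTS = ['some_account', 'another_account']
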
Example #5
def snapshot_mailman(verbose=False):
    lists = util.list_mailman_lists(verbose)
    today = datetime.now().date()
    for l in lists:
        if verbose: print 'Processing snapshots for %s...' % l['name']
        latest = Session.query(SnapshotOfMailman)\
                .filter(SnapshotOfMailman.list_name==l['name'])\
                .order_by(SnapshotOfMailman.timestamp.desc())\
                .first()
        # By default, go back 180 days when building snapshots
        since = today - timedelta(days=180)
        if latest:
            if latest.timestamp>=today:
                if verbose: print ' -> most recent snapshots have already been processed.'
                continue
            since = latest.timestamp + timedelta(days=1)
        # Download subscriber list
        roster_url = l['link'].replace('listinfo','roster')
        num_subscribers = len(_scrape_subscribers(roster_url, verbose=verbose))
        # Create a snapshot of each day
        while since<today:
            posts_today = Session.query(ActivityInMailman)\
                            .filter(ActivityInMailman.list_name==l['name'])\
                            .filter(ActivityInMailman.timestamp.between(since,since+timedelta(days=1)))\
                            .count()
            sn = SnapshotOfMailman(
                    list_name=l['name'],
                    timestamp=since,
                    subscribers=num_subscribers,
                    posts_today=posts_today)
            Session.add(sn)
            if verbose: print '  -> ',sn.toJson()
            since += timedelta(days=1)
        # Walk through message history, counting messages per day
        Session.commit()
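
_scrape_subscribers() is not shown. A rough sketch under the assumption that the roster URL points at a public Mailman roster page, which lists member addresses with '@' obfuscated as ' at ' inside link text:

# Hypothetical sketch of _scrape_subscribers(), not the original helper.
import re
import requests

def _scrape_subscribers(roster_url, verbose=False):
    html = requests.get(roster_url).text
    # Roster entries typically look like <a href="...">user at example.org</a>
    addresses = [text.replace(' at ', '@')
                 for text in re.findall(r'>([^<>]+ at [^<>]+)</a>', html)]
    if verbose:
        print('  -> found %d subscribers at %s' % (len(addresses), roster_url))
    return addresses
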
Example #6
def snapshot_repos(verbose=False):
    """Create SnapshotOfRepo objects in the database for 
       every day since the last time this was run."""
    repo_list = _get_repo_list(verbose)
    today = datetime.now().date()
    for (repo_name,repo) in repo_list.items():
        if verbose: print 'Processing snapshots for %s...' % repo_name
        latest = Session.query(SnapshotOfGithub)\
                .filter(SnapshotOfGithub.repo_name==repo_name)\
                .order_by(SnapshotOfGithub.timestamp.desc())\
                .first()
        # Skip if today's snapshot has already been taken
        if latest and latest.timestamp>=today:
            if verbose: print ' -> most recent snapshots have already been processed.'
            continue
        # Take a snapshot for today
        snapshot = SnapshotOfGithub(
                timestamp=today,
                repo_name=repo_name,
                open_issues=repo.open_issues,
                size=repo.size,
                watchers=repo.watchers,
                forks=repo.forks)
        if verbose: print '  -> ',snapshot.toJson()
        Session.add(snapshot)
        Session.commit()
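
_get_repo_list() is defined elsewhere. A rough sketch assuming the PyGithub library, whose Repository objects expose the open_issues, size, watchers and forks attributes read above; the organisation name and token variable are guesses:

# Hypothetical sketch of _get_repo_list(), not the original helper.
import os
from github import Github

def _get_repo_list(verbose=False):
    gh = Github(os.environ.get('GITHUB_TOKEN'))   # token is optional but avoids rate limits
    org = gh.get_organization('okfn')             # organisation name is an assumption
    repos = {}
    for repo in org.get_repos():
        if verbose:
            print('Found repo %s' % repo.name)
        repos[repo.name] = repo
    return repos
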
Example #7
def snapshot_facebook(verbose=False):
    api = facebook.GraphAPI()
    obj = api.get_object('/OKFNetwork')
    if 'likes' not in obj:
        print 'Got bad object from server.'
        pprint(obj)
        raise ValueError('Bad object from server')
    likes = obj['likes']
    if verbose:
        print 'Likes today: %d' % likes
    # Snapshot creation code...
    today = datetime.now().date()
    latest = Session.query(model.SnapshotOfFacebook)\
            .order_by(model.SnapshotOfFacebook.timestamp.desc())\
            .first()
    if latest and latest.timestamp>=today:
        if verbose: print ' -> most recent snapshots have already been processed.'
        return
    snapshot = model.SnapshotOfFacebook(likes=likes, timestamp=today)
    if verbose: print '  -> ',snapshot.toJson()
    Session.add(snapshot)
    Session.commit()
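
facebook.GraphAPI() is constructed here without an access token, which was enough to read public page data when this was written; current versions of the Graph API require one. A hedged variant using the facebook-sdk client (the environment variable name is a placeholder):

# Hypothetical: pass an access token to the facebook-sdk client.
import os
import facebook

api = facebook.GraphAPI(access_token=os.environ.get('FACEBOOK_ACCESS_TOKEN'))
obj = api.get_object('/OKFNetwork')
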
            latest = Session.query(SnapshotOfAnalytics)\
                    .filter(SnapshotOfAnalytics.website==x['name'])\
                    .order_by(SnapshotOfAnalytics.timestamp.desc())\
                    .first()
            day = (datetime.now()-timedelta(days=1)).date()
            if latest and latest.timestamp>=day:
                if verbose: print ' -> most recent snapshot for %s has already been processed.' % x['name']
                continue
            hits = get_hits(service, profile_id, day.isoformat())
            sn = SnapshotOfAnalytics(timestamp=day,website=x['name'],hits=hits)
            Session.add(sn)
            if verbose:
                print '%s: %d' % (x['name'], hits)
        except Exception, e:
            print e
    Session.commit()
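
get_hits() is not shown. A minimal sketch assuming the Google Analytics Core Reporting API v3 exposed by the service object that initialize_service() builds below, with ga:pageviews as the "hits" metric (the metric choice is a guess):

# Hypothetical sketch of get_hits(), not the original helper.
def get_hits(service, profile_id, date):
    result = service.data().ga().get(
        ids='ga:%s' % profile_id,
        start_date=date,
        end_date=date,
        metrics='ga:pageviews').execute()
    return int(result['totalsForAllResults']['ga:pageviews'])
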

def initialize_service(googleanalytics_auth_json):
  # Create an httplib2.Http object to handle our HTTP requests.
  http = httplib2.Http()
  # Prepare credentials, and authorize HTTP object with them.
  assert googleanalytics_auth_json,'No GOOGLEANALYTICS_AUTH set in environment. This should be the sample.dat file created by authenticating a sample app with Google Analytics.\n\n  Read: https://developers.google.com/analytics/solutions/articles/hello-analytics-api'
  credentials = Credentials.new_from_json(googleanalytics_auth_json)
  if credentials is None or credentials.invalid:
    credentials = run(FLOW, storage)
  http = credentials.authorize(http)
  # Retrieve service.
  return build('analytics', 'v3', http=http)

def iterate_profiles(service):
  accounts = service.management().accounts().list().execute()