def getHNData(verbose=False, limit=100, sub="showstories"):
    from hackernews import HackerNews
    from hackernews import settings
    import hoverpy, time

    dbpath = "data/hn.%s.db" % sub
    with hoverpy.HoverPy(recordMode="once", dbpath=dbpath) as hp:
        # when replaying, point the client at plain http so requests
        # go through hoverfly's simulation instead of the live API
        if hp.mode() != "capture":
            settings.supported_api_versions["v0"] = \
                "http://hacker-news.firebaseio.com/v0/"
        hn = HackerNews()
        titles = []
        print("GETTING HACKERNEWS %s DATA" % sub)
        subs = {
            "showstories": hn.show_stories,
            "askstories": hn.ask_stories,
            "jobstories": hn.job_stories,
            "topstories": hn.top_stories,
        }
        start = time.time()
        for story_id in subs[sub](limit=limit):
            story = hn.get_item(story_id)
            if verbose:
                print(story.title.lower())
            titles.append(story.title.lower())
        print("got %i hackernews titles in %f seconds" %
              (len(titles), time.time() - start))
        return titles
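A minimal driver for the function above (a sketch; the specific arguments are an assumption, not part of the original example). The first run records live Firebase traffic into data/hn.topstories.db; every run after that replays from it without touching the network:

# hypothetical usage: first run captures, later runs replay from the db
titles = getHNData(verbose=True, limit=10, sub="topstories")
print("fetched %i titles" % len(titles))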
def getRedditData(verbose=False, comments=True, limit=100, sub="all"):
    import hoverpy, praw, time

    dbpath = "data/reddit.%s.db" % sub
    with hoverpy.HoverPy(recordMode="once", dbpath=dbpath,
                         httpsToHttp=True) as hp:
        titles = []
        print("GETTING REDDIT r/%s DATA" % sub)
        r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_",
                        http_proxy=hp.httpProxy(),
                        https_proxy=hp.httpProxy(),
                        validate_certs="off")
        # no need to rate-limit ourselves when replaying from the db
        if hp.mode() != "capture":
            r.config.api_request_delay = 0
        subreddit = r.get_subreddit(sub)
        for submission in subreddit.get_hot(limit=limit):
            text = submission.title.lower()
            if comments:
                flat_comments = praw.helpers.flatten_tree(submission.comments)
                for comment in flat_comments:
                    if hasattr(comment, "body"):
                        text += comment.body + " "
            if verbose:
                print(text)
            titles.append(text)
        return titles
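getRedditData can be exercised the same way (again a sketch, with assumed arguments): the first run captures live reddit traffic into data/reddit.python.db, and later runs replay offline with praw's request delay disabled:

# hypothetical usage: titles plus flattened comment text per submission
docs = getRedditData(comments=True, limit=25, sub="python")
print("fetched %i documents" % len(docs))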
import time
import hoverpy
import requests
import os

# capture over https on the first run; replay over plain http
# once the database exists, so hoverfly can serve the responses
prot = "http" if os.path.isfile("hn.db") else "https"

with hoverpy.HoverPy(recordMode="once", dbpath="hn.db") as hp:
    print("started hoverpy in %s mode" % hp.mode())
    start = time.time()
    r = requests.get("%s://hacker-news.firebaseio.com/v0/topstories.json" % prot)
    for item in r.json():
        print(requests.get("%s://hacker-news.firebaseio.com/v0/item/%i.json" %
                           (prot, item)).json()["title"])
    print("got articles in %f seconds" % (time.time() - start))
import hoverpy
import praw
import os
import time

sub = "python"
db = "%s.db" % sub
capture = not os.path.isfile(db)

with hoverpy.HoverPy(dbpath=db, recordMode="once") as hp:
    start = time.time()
    titles = []
    print("GETTING REDDIT r/%s DATA" % sub)
    r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_",
                    http_proxy=hp.httpProxy(),
                    https_proxy=hp.httpsProxy(),
                    validate_certs="off")
    # skip praw's built-in rate limiting when replaying from the db
    if not capture:
        r.config.api_request_delay = 0
    subreddit = r.get_subreddit(sub)
    for submission in subreddit.get_hot(limit=100):
        text = submission.title.lower()
        print(text)
        for comment in praw.helpers.flatten_tree(submission.comments):
            if hasattr(comment, "body"):
                text += comment.body + " "
        titles.append(text)
    print("got %i %s in %f" % (len(titles), sub, time.time() - start))
import time
import hoverpy
import requests

rtd = "http://readthedocs.org/api/v1/project/?limit=50&offset=0&format=json"

with hoverpy.HoverPy(recordMode="once"):
    start = time.time()
    objects = requests.get(rtd).json()["objects"]
    links = ["http://readthedocs.org" + x["resource_uri"] for x in objects]
    for link in links:
        response = requests.get(link)
        print("url: %s, status code: %s" % (link, response.status_code))
    print("Time taken: %f" % (time.time() - start))