def checkItems_(self, sender):
    """Poll the Google Reader reading list and announce items not yet seen."""
    NSLog("Checking items")
    # Both credentials are required before we can authenticate.
    if self.user is None or self.password is None:
        return
    auth = ClientAuthMethod(self.user, self.password)
    reader = GoogleReader(auth)
    reader.makeSpecialFeeds()
    reading_list = reader.getSpecialFeed(ReaderUrl.READING_LIST)
    reading_list.loadItems()
    if self.lastId is None:
        # First run: remember and announce only the newest item.
        newest = reading_list.items[0]
        self.lastId = newest.id
        self.notifyNewItem(newest)
    elif len(reading_list.items) > 0:
        newest_id = reading_list.items[0].id
        # Announce everything newer than the last item we saw.
        for entry in reading_list.items:
            if entry.id == self.lastId:
                break
            self.notifyNewItem(entry)
        self.lastId = newest_id
    NSLog("Finished Checking items")
def test_add_remove_single_feed_tag(self):
    """Round-trip a single tag: add it to an item, verify, remove, verify."""
    auth = ClientAuthMethod(username, password)
    reader = GoogleReader(auth)
    reading_list = SpecialFeed(reader, ReaderUrl.READING_LIST)
    reading_list.loadItems()
    tag_name = 'test-single-tag'
    full_label = 'user/-/label/' + tag_name

    def has_tag(item):
        # True when any category string on the item mentions the tag.
        return any(tag_name in c for c in item.data['categories'])

    def reload_first():
        # Re-fetch the feed and return its (fresh) first item.
        reading_list.clearItems()
        reading_list.loadItems()
        return reading_list.items[0]

    original = reading_list.items[0]
    # The tag must not be present before we start.
    self.assertFalse(has_tag(original))
    reader.addItemTag(original, full_label)
    refreshed = reload_first()
    self.assertTrue(has_tag(refreshed))
    reader.removeItemTag(refreshed, full_label)
    self.assertFalse(has_tag(reload_first()))
def __init__(self, username, password, method='kmeans'):
    """Authenticate against Google Reader and prepare an empty corpus."""
    credentials = ClientAuthMethod(username, password)
    self.reader = GoogleReader(credentials)
    # Populate the reader's feed/category caches up front.
    self.reader.buildSubscriptionList()
    self.categories = self.reader.getCategories()
    self.corpus = Corpus()
    self.method = method
def main(): auth = OAuthMethod(oauth_key, oauth_secret) auth.setRequestToken() print auth.buildAuthUrl() raw_input() auth.setAccessToken() reader = GoogleReader(auth) print reader.getUserInfo()
def test_reading_list_exists(self):
    """The reading-list special feed returns a dict with the expected title."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    reader.makeSpecialFeeds()
    content = reader.getFeedContent(
        reader.getSpecialFeed(ReaderUrl.READING_LIST))
    self.assertEqual(dict, type(content))
    self.assertTrue(
        re.search('reading list in Google Reader', content['title']))
def getGoogleFeeds(auth):
    """Return the list of subscribed feeds for *auth*, or None on failure.

    Bug fix: the original ``except IOError: pass`` left ``google`` unbound,
    so the very next line raised NameError instead of handling the auth
    error. We now bail out explicitly so the caller can react (e.g.
    redirect to the homepage with an error message).
    """
    try:
        google = GoogleReader(auth)
    except IOError:
        # Bad username/password or network trouble -- caller gets None.
        return None
    if google.buildSubscriptionList():
        return google.getFeeds()
    # Subscription list could not be built; same implicit-None contract
    # as before, made explicit.
    return None
def test_full_auth_process_without_callback(self):
    """Full OAuth1 approval flow (no callback) should yield valid user info."""
    auth = OAuthMethod(oauth_key, oauth_secret)
    auth.setRequestToken()
    # Approve the request token through the automated browser helper.
    automated_oauth_approval(auth.buildAuthUrl())
    auth.setAccessToken()
    info = GoogleReader(auth).getUserInfo()
    self.assertEqual(dict, type(info))
    self.assertEqual(firstname, info['userName'])
def test_full_auth_and_access_userdata(self):
    """OAuth2 approval flow should end with usable account data."""
    auth = OAuth2Method(client_id, client_secret)
    auth.setRedirectUri(redirect_url)
    # Drive the browser-side approval and feed the resulting code back in.
    auth.code = automated_oauth2_approval(auth.buildAuthUrl())
    auth.setAccessToken()
    info = GoogleReader(auth).getUserInfo()
    self.assertEqual(dict, type(info))
    self.assertEqual(firstname, info['userName'])
def login():
    """Return an authenticated GoogleReader, caching credentials in auth.txt.

    Fixes over the original:
    - the bare ``except:`` (which also swallowed KeyboardInterrupt and
      SystemExit) is narrowed to IOError/ValueError (missing file or a
      file that does not split into exactly two lines);
    - file handles are closed deterministically via ``with`` instead of
      being leaked.
    """
    try:
        with open('auth.txt') as f:
            username, password = f.read().strip().split('\n')
    except (IOError, ValueError):
        # Missing or corrupt cache: prompt interactively and rewrite it.
        username = raw_input('username? ')
        password = getpass.getpass('password? ')
        with open('auth.txt', 'w') as f:
            f.write("{0}\n{1}\n".format(username, password))
    auth = ClientAuthMethod(username, password)
    reader = GoogleReader(auth)
    reader.buildSubscriptionList()
    return reader
def login():
    """
    requires an 'auth.txt' to exist in the current directory, this file
    contains two lines, the first line is your username, the second line
    is your password.

    Returns an authenticated GoogleReader instance, or False when the
    subscription list could not be built.
    """
    # `with` guarantees the credentials file is closed; the original
    # leaked the handle returned by open().
    with open('auth.txt') as f:
        username, password = f.read().strip().split('\n')
    auth = ClientAuthMethod(username, password)
    reader = GoogleReader(auth)
    if not reader.buildSubscriptionList():
        return False
    return reader
def test_transaction_add_feed_tags(self):
    """Tags queued inside a transaction should all appear after commit."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    reading_list = SpecialFeed(reader, ReaderUrl.READING_LIST)
    reading_list.loadItems()
    tags = ['test-transaction%s' % i for i in range(5)]
    target = reading_list.items[0]
    # Queue all five tag additions, then send them as one batch.
    reader.beginAddItemTagTransaction()
    for tag in tags:
        reader.addItemTag(target, 'user/-/label/' + tag)
    reader.commitAddItemTagTransaction()
    # Re-fetch and count how many categories mention one of our tags.
    reading_list.clearItems()
    reading_list.loadItems()
    refreshed = reading_list.items[0]
    matched = [c for c in refreshed.data['categories']
               if any(tag in c for tag in tags)]
    self.assertEqual(5, len(matched))
def run(self, user, loadLimit, *args, **kwargs):
    """Celery task entry point: build the subscription list for *user*.

    Authenticates via the user's stored OAuth2 credential when present,
    or -- for tests only -- via a (username, password) pair passed as two
    positional args.  `loadLimit` is accepted but not read here;
    presumably consumed by a later stage of the task -- TODO confirm.
    """
    # user.credential should always be valid when doing oauth2
    if user.credential:
        credential = user.credential
        auth = OAuth2Method(credential.client_id, credential.client_secret)
        auth.authFromAccessToken(credential.access_token)
        auth.setActionToken()
    # username/password auth method, should only be used by our tests
    elif len(args) == 2:
        auth = ClientAuthMethod(args[0], args[1])
    # NOTE(review): if neither branch above runs (no credential and not
    # exactly two positional args), `auth` is unbound and the next line
    # raises NameError -- confirm callers can never hit that case.
    reader = GoogleReader(auth)
    try:
        reader.buildSubscriptionList()
    # Python 2 except syntax; TypeError is what libgreader surfaces on
    # auth/transport failures here.
    except TypeError, exc:
        # NOTE(review): retry() is invoked on a *fresh* task instance
        # rather than `self` -- verify this is intended for this celery
        # task class.
        SyncFromReaderAPITask().retry(exc=exc)
def test_loading_item_count(self):
    """loadLimit should cap both the local item list and countItems()."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    reading_list = SpecialFeed(reader, ReaderUrl.READING_LIST)
    reading_list.loadItems(loadLimit=5)
    for observed in (len(reading_list.items), reading_list.countItems()):
        self.assertEqual(5, observed)
def __init__(self, username, password):
    """Log in and snapshot the account's feeds, categories and special feeds."""
    self.auth = ClientAuthMethod(username, password)
    self.reader = GoogleReader(self.auth)
    # Fetch the special feeds (starred, etc.) and the full subscription
    # list (feeds AND categories) up front.
    self.reader.makeSpecialFeeds()
    self.reader.buildSubscriptionList()
    self.categories = self.reader.categories
    # Copy the containers so later library calls cannot mutate ours.
    self.feeds = list(self.reader.feeds)
    self.specialFeeds = dict(self.reader.specialFeeds)
def test_marking_read(self):
    """markRead() should succeed and flip the item's read flag."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    reading_list = SpecialFeed(reader, ReaderUrl.READING_LIST)
    reading_list.loadItems()
    first = reading_list.items[0]
    self.assertTrue(first.markRead())
    self.assertTrue(first.isRead())
def get(self):
    """Render the subscription list for the signed-in user, or redirect to login."""
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(self.request.uri))
        return
    # Pull this user's stored OAuth credentials out of the datastore.
    storage = StorageByKeyName(Credentials, user.user_id(), 'credentials')
    auth = GAPDecoratorAuthMethod(storage.get())
    reader = GoogleReader(auth)
    reader.buildSubscriptionList()
    template_values = {'feeds': reader.getFeeds()}
    path = os.path.join(os.path.dirname(__file__),
                        'templates/template_slist.html')
    self.response.out.write(template.render(path, template_values))
def test_full_auth_process_with_callback(self):
    """OAuth1 flow with a callback: the verifier arrives in the query string."""
    auth = OAuthMethod(oauth_key, oauth_secret)
    # must be a working callback url for testing
    auth.setCallback("http://www.asktherelic.com")
    token, token_secret = auth.setAndGetRequestToken()
    # Approving the request token redirects to the callback URL; the
    # verifier token rides along in its query string.
    response = automated_oauth_approval(auth.buildAuthUrl())
    query = urlparse.urlparse(response.geturl()).query
    verifier = urlparse.parse_qs(query)['oauth_verifier'][0]
    auth.setAccessTokenFromCallback(token, token_secret, verifier)
    info = GoogleReader(auth).getUserInfo()
    self.assertEqual(dict, type(info))
    self.assertEqual(firstname, info['userName'])
def main():
    """Export the user's Google Reader subscriptions as OPML on stdout."""
    if len(sys.argv) <= 1 or len(sys.argv) > 3:
        print("Usage: %s username [password]" % (sys.argv[0]))
        return 1
    username = sys.argv[1]
    if len(sys.argv) == 2:
        # Prompt on stderr so stdout stays clean for the OPML document.
        sys.stderr.write('Password for %s: ' % username)
        password = raw_input()
    else:
        password = sys.argv[2]

    reader = GoogleReader(ClientAuthMethod(username, password))

    root = ET.Element('opml')
    head = ET.SubElement(root, 'head')
    ET.SubElement(head, 'title').text = \
        '%s subscriptions in Google Reader' % username
    body = ET.SubElement(root, 'body')

    category_els = {}
    reader.buildSubscriptionList()
    for feed in reader.getSubscriptionList():
        if not feed.getCategories():
            make_feed_el(feed, body)
            continue
        for category in feed.getCategories():
            if category.id not in category_els:
                # First sighting of this category: create its outline node.
                el = ET.SubElement(body, 'outline')
                el.set('text', category.label)
                el.set('title', category.label)
                category_els[category.id] = el
            make_feed_el(feed, category_els[category.id])

    ET.ElementTree(root).write(sys.stdout, xml_declaration=True)
def main():
    """Dump Google Reader subscriptions to stdout in OPML format."""
    argc = len(sys.argv)
    if argc <= 1 or argc > 3:
        print ("Usage: %s username [password]" % (sys.argv[0]))
        return 1
    username = sys.argv[1]
    if argc == 2:
        sys.stderr.write("Password for %s: " % username)
        password = raw_input()
    else:
        password = sys.argv[2]

    auth = ClientAuthMethod(username, password)
    reader = GoogleReader(auth)

    # Build the OPML skeleton: <opml><head><title/></head><body/></opml>
    root = ET.Element("opml")
    head = ET.SubElement(root, "head")
    title = ET.SubElement(head, "title")
    title.text = "%s subscriptions in Google Reader" % username
    body = ET.SubElement(root, "body")

    reader.buildSubscriptionList()
    category_els = {}
    for feed in reader.getSubscriptionList():
        if feed.getCategories():
            for category in feed.getCategories():
                el = category_els.get(category.id)
                if el is None:
                    # Lazily create one <outline> element per category.
                    el = ET.SubElement(body, "outline")
                    el.set("text", category.label)
                    el.set("title", category.label)
                    category_els[category.id] = el
                make_feed_el(feed, el)
        else:
            make_feed_el(feed, body)

    tree = ET.ElementTree(root)
    tree.write(sys.stdout, xml_declaration=True)
def test_reading_list_exists(self):
    """Fetching the READING_LIST special feed yields a dict titled as expected."""
    ca = ClientAuthMethod(username, password)
    reader = GoogleReader(ca)
    reader.makeSpecialFeeds()
    reading_list = reader.getSpecialFeed(ReaderUrl.READING_LIST)
    feeds = reader.getFeedContent(reading_list)
    self.assertEqual(dict, type(feeds))
    match = re.search('reading list in Google Reader', feeds['title'])
    self.assertTrue(match)
def test_subscribe_unsubscribe(self):
    """Unsubscribe (always True), resubscribe, then verify the feed is listed."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    slashdot = 'feed/http://rss.slashdot.org/Slashdot/slashdot'
    # unsubscribe always return true; revert feedlist state
    self.assertTrue(reader.unsubscribe(slashdot))
    self.assertTrue(reader.subscribe(slashdot))
    # Give the server a moment to register the change.
    time.sleep(1)
    reader.buildSubscriptionList()
    subscription_ids = [feed.id for feed in reader.getSubscriptionList()]
    self.assertIn(slashdot, subscription_ids)
def test_subscribe_unsubscribe(self):
    """Round-trip subscribe/unsubscribe for the Slashdot feed."""
    auth = ClientAuthMethod(username, password)
    reader = GoogleReader(auth)
    feed_id = 'feed/http://rss.slashdot.org/Slashdot/slashdot'
    # unsubscribe always return true; revert feedlist state
    self.assertTrue(reader.unsubscribe(feed_id))
    # now subscribe
    self.assertTrue(reader.subscribe(feed_id))
    time.sleep(1)  # wait for server to update
    reader.buildSubscriptionList()
    self.assertIn(
        feed_id,
        [subscription.id for subscription in reader.getSubscriptionList()])
def test_oauth_subscribe(self):
    """OAuth2-authenticated subscribe/unsubscribe round-trip."""
    auth = OAuth2Method(client_id, client_secret)
    auth.setRedirectUri(redirect_url)
    # Drive the browser-side approval and feed the code back in.
    auth.code = automated_oauth2_approval(auth.buildAuthUrl())
    auth.setAccessToken()
    auth.setActionToken()
    reader = GoogleReader(auth)
    slashdot = 'feed/http://rss.slashdot.org/Slashdot/slashdot'
    # unsubscribe always return true; revert feedlist state
    self.assertTrue(reader.unsubscribe(slashdot))
    self.assertTrue(reader.subscribe(slashdot))
    import time  # local import preserved from the original
    time.sleep(1)  # wait for server to update
    reader.buildSubscriptionList()
    current_ids = [f.id for f in reader.getSubscriptionList()]
    self.assertIn(slashdot, current_ids)
def test_transaction_add_feed_tags(self):
    """Committing a tag transaction should apply every queued tag."""
    ca = ClientAuthMethod(username, password)
    reader = GoogleReader(ca)
    container = SpecialFeed(reader, ReaderUrl.READING_LIST)
    container.loadItems()
    tags = ['test-transaction%s' % n for n in range(5)]
    item = container.items[0]
    reader.beginAddItemTagTransaction()
    for label in tags:
        reader.addItemTag(item, 'user/-/label/' + label)
    reader.commitAddItemTagTransaction()
    # reload and count the categories that picked up one of our tags
    container.clearItems()
    container.loadItems()
    hits = 0
    for category in container.items[0].data['categories']:
        if any(label in category for label in tags):
            hits += 1
    self.assertEqual(5, hits)
def test_add_remove_single_feed_tag(self):
    """A tag shows up after add+reload and disappears after remove+reload."""
    ca = ClientAuthMethod(username, password)
    reader = GoogleReader(ca)
    container = SpecialFeed(reader, ReaderUrl.READING_LIST)
    container.loadItems()
    tag_name = 'test-single-tag'
    label = 'user/-/label/' + tag_name

    def reload_and_check():
        # Reload the feed; return its first item and the tag's presence.
        container.clearItems()
        container.loadItems()
        item = container.items[0]
        return item, any(tag_name in c for c in item.data['categories'])

    original = container.items[0]
    # assert tag doesn't exist yet
    self.assertFalse(
        any(tag_name in c for c in original.data['categories']))
    reader.addItemTag(original, label)
    tagged_item, present = reload_and_check()
    self.assertTrue(present)
    reader.removeItemTag(tagged_item, label)
    _, still_present = reload_and_check()
    self.assertFalse(still_present)
def import_google_reader_begin(user_id, access_token):
    """Kick off a full Google Reader import for one user.

    Authenticates with the given OAuth2 access token, records import
    bookkeeping (registration date, totals) in a GoogleReaderImport,
    then fans out celery sub-tasks: one for the starred feed and one
    per subscribed feed (capped at config.GR_MAX_FEEDS).
    """
    auth = OAuth2Method(settings.GOOGLE_OAUTH2_CLIENT_ID,
                        settings.GOOGLE_OAUTH2_CLIENT_SECRET)
    auth.authFromAccessToken(access_token)
    reader = GoogleReader(auth)
    django_user, mongo_user = get_user_from_dbs(user_id)
    username = django_user.username
    try:
        user_infos = reader.getUserInfo()
    # libgreader surfaces auth/transport failures as TypeError here.
    except TypeError:
        LOGGER.exception(u'Could not start Google Reader import for user %s.',
                         username)
        # Don't refresh, it's now done by a dedicated periodic task.
        # If we failed, it means the problem is quite serious.
        # import_google_reader_trigger(user_id, refresh=True)
        return
    GR_MAX_FEEDS = config.GR_MAX_FEEDS
    LOGGER.info(u'Starting Google Reader import for user %s.', username)
    gri = GoogleReaderImport(user_id)
    # take note of user informations now that we have them.
    gri.start(user_infos=user_infos)
    reader.buildSubscriptionList()
    total_reads, reg_date = reader.totalReadItems(without_date=False)
    total_starred, star1_date = reader.totalStarredItems(without_date=False)
    total_feeds = len(reader.feeds) + 1  # +1 for 'starred'
    gri.reg_date(pytime.mktime(reg_date.timetuple()))
    gri.star1_date(pytime.mktime(star1_date.timetuple()))
    gri.total_reads(total_reads)
    gri.total_starred(total_starred)
    LOGGER.info(u'Google Reader import for user %s: %s feed(s) and %s read '
                u'article(s) to go…', username, total_feeds, total_reads)
    if total_feeds > GR_MAX_FEEDS and not settings.DEBUG:
        mail_admins('User {0} has more than {1} feeds: {2}!'.format(
                    username, GR_MAX_FEEDS, total_feeds),
                    u"\n\nThe GR import will be incomplete.\n\n"
                    u"Just for you to know…\n\n")
    # We launch the starred feed import first. Launching it after the
    # standard feeds makes it being delayed until the world's end.
    reader.makeSpecialFeeds()
    starred_feed = reader.getSpecialFeed(ReaderUrl.STARRED_LIST)
    import_google_reader_starred.apply_async((user_id, username, starred_feed),
                                             queue='low')
    # Starts at 1 because the starred feed above is already "processed".
    processed_feeds = 1
    feeds_to_import = []
    for gr_feed in reader.feeds[:GR_MAX_FEEDS]:
        try:
            feed = create_feed(gr_feed, mongo_user)
        except Feed.DoesNotExist:
            LOGGER.exception(u'Could not create feed “%s” for user %s, '
                             u'skipped.', gr_feed.title, username)
            continue
        processed_feeds += 1
        feeds_to_import.append((user_id, username, gr_feed, feed))
        LOGGER.info(u'Imported feed “%s” (%s/%s) for user %s…',
                    gr_feed.title, processed_feeds, total_feeds, username)
    # We need to clamp the total, else task won't finish in
    # the case where the user has more feeds than allowed.
    #
    gri.total_feeds(min(processed_feeds, GR_MAX_FEEDS))
    for feed_args in feeds_to_import:
        import_google_reader_articles.apply_async(feed_args, queue='low')
    LOGGER.info(u'Imported %s/%s feeds in %s. Articles import already '
                u'started with limits: date: %s, %s waves of %s articles, '
                u'max articles: %s, reads: %s, starred: %s.',
                processed_feeds, total_feeds,
                naturaldelta(now() - gri.start()),
                naturaltime(max([gri.reg_date(), GR_OLDEST_DATE])),
                config.GR_WAVE_LIMIT, config.GR_LOAD_LIMIT,
                config.GR_MAX_ARTICLES, total_reads, total_starred)
subscription.itemsById = {} subscription.loadMoreItems(loadLimit=load) except Exception: logging.warning(sys.exc_info()[0]) print "Continuing..." sys.exc_clear() loadMoreUntilSuccessful(subscription, load) couch = couchdb.Server('http://localhost:5984') try: db = couch[db_name] except Exception: db = couch.create(db_name) auth = ClientAuthMethod(username, password) reader = GoogleReader(auth) ll = 100 reader.buildSubscriptionList() l = reader.getSubscriptionList() for li in l: print li.title.encode('ascii', 'xmlcharrefreplace') loadUntilSuccessful(li, ll) while li.lastLoadLength > 0: bulk_upload = [] for i in li.getItems(): doc = {} doc = i.data if len(li.categories) > 0: doc["label"] = li.categories[0].label bulk_upload.append(doc)
'redirect_server_port') ######################### # move this is a method! if not os.path.isfile(CACHED_CA_FILE) or not os.path.isfile( CACHED_READER_FILE): topickle = True elif ((time.time() - os.path.getmtime(CACHED_CA_FILE)) / 60) > 10: # older than 10 mins so refresh topickle = True else: topickle = False if topickle: ca = ClientAuth(greader_user, greader_pass) reader = GoogleReader(ca) reader.buildSubscriptionList() pickle.dump(ca, open(CACHED_CA_FILE, 'wb'), -1) pickle.dump(reader, open(CACHED_READER_FILE, 'wb'), -1) else: ca = pickle.load(open(CACHED_CA_FILE)) reader = pickle.load(open(CACHED_READER_FILE)) ######################### params = get_params() url = None id = None name = None mode = None try: url = urllib.unquote_plus(params["url"])
# Quick smoke-test script: authenticate against Google Reader and dump
# the account's user info.
from libgreader import GoogleReader, ClientAuthMethod, Feed

# SECURITY NOTE(review): real-looking credentials are hard-coded here and
# committed to source control -- rotate this password and load credentials
# from the environment or a config file instead.
auth = ClientAuthMethod('acpigeon', 'Katana666')
reader = GoogleReader(auth)
print reader.getUserInfo()
def test_reader_user_info(self):
    """getUserInfo() returns a dict carrying the expected first name."""
    reader = GoogleReader(ClientAuthMethod(username, password))
    info = reader.getUserInfo()
    self.assertEqual(dict, type(info))
    self.assertEqual(firstname, info['userName'])
USE_REDIRECT_SERVER = __settings__.getSetting('use_redirect_server') REDIRECT_SERVER_URL = 'http://127.0.0.1:' + __settings__.getSetting('redirect_server_port') ######################### # move this is a method! if not os.path.isfile(CACHED_CA_FILE) or not os.path.isfile(CACHED_READER_FILE): topickle = True elif ((time.time() - os.path.getmtime(CACHED_CA_FILE))/60) > 10: # older than 10 mins so refresh topickle = True else: topickle = False if topickle: ca = ClientAuth( greader_user, greader_pass ) reader = GoogleReader( ca ) reader.buildSubscriptionList() pickle.dump(ca, open(CACHED_CA_FILE,'wb'), -1) pickle.dump(reader, open(CACHED_READER_FILE,'wb'), -1) else: ca = pickle.load(open(CACHED_CA_FILE)) reader = pickle.load(open(CACHED_READER_FILE)) ######################### params=get_params() url=None id=None name=None mode=None try: url=urllib.unquote_plus(params["url"])
class DM_GReader():
    """Cluster articles from a Google Reader category.

    Crawls one category into a pattern.vector Corpus, then clusters the
    documents with either k-means (default) or a cover tree, exposing the
    cluster representatives ("centroids") and helpers to fetch/render them.
    """

    def __init__(self, username, password, method='kmeans'):
        # Authenticate and cache the account's category list.
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        # 'kmeans' or 'covertree'; selects the algorithm in _generate_clusters.
        self.method = method

    def import_category(self, category_id=0, path=None, local=False,
                        max_articles=2000, days=3):
        """Import the specific category to a Pattern Corpus for future
        calculation.

        category_id: the integer indicates which category to use.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one
            day's subscriptions is too much.
        days: how many days back to crawl.
        """
        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return
        if local:
            # Skip crawling entirely and reuse the pickled corpus.
            self.corpus = Corpus.load(path)
            return
        self.target_category = self.categories[category_id]
        continuation = None
        # Crawl only the data within one day
        time_threadshold = calendar.timegm(
            (datetime.date.today() - datetime.timedelta(days=days)).timetuple())
        i = 1
        # Each reader query returns ~20 items, hence max_articles / 20 pages.
        while 1 and i < (max_articles / 20):
            self.target_category_content = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']
            # Stop once the page is older than the cut-off date.
            if self.target_category_content['updated'] < time_threadshold:
                break
            feeds_docs = []
            for feed in feeds:
                # Last 16 chars of the reader id serve as the document name.
                doc_name = feed[u'id'][-16:]
                # Prefer full 'content', fall back to 'summary'.
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break
            self.corpus.extend(feeds_docs)
            # Follow the pagination token when the server provides one.
            if u'continuation' in self.target_category_content and \
                    self.target_category_content[u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break
            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1
        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by
        KMPP method.

        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with
        small error.
        maxlevel is the cover-tree depth (covertree method only).
        """
        if self.method == "kmeans":
            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS, k=k,
                                                seed=KMPP, p=p, iterations=10)
            doc_list = []
            # For each cluster, calculate the centroid, and calculate the
            # doc (vector) which is nearest to the centroid.
            for cluster in self.clusters:
                c = centroid(cluster)
                # NOTE(review): d_min starts as a *tuple*, not a distance,
                # and `d = distance(...)` is recomputed in the comparison --
                # looks like it should be `d_min = distance(cluster[0].vector, c)`.
                # Left untouched; verify against the original intent.
                d_min = (cluster[0].vector, c)
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if distance(doc.vector, c) < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster]
                             for cluster in self.clusters]
        elif self.method == 'covertree':
            def mydistance(doc_name1, doc_name2):
                # Distance between two documents, looked up by name.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)
            self.covertree = Covertree(mydistance, maxlevel)
            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)
            self.covertree.merge_levels()
            self.centroids, self.clusters = \
                self.covertree.clustering_from_ct(k)

    def generate_repr_ids(self, k):
        """For each cluster, we choose an arbitary article as the cluster's
        representative.

        Return the ids of the article, here the document name is the
        article's id. Google Reader is using
        "i=http://www.google.com/reader/api/0/stream/items/contents"
        to get the content of a specific data.
        Now we use the centroid to represent the documents.
        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        """Sum of distances from every clustered document to its centroid."""
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(
                    self.corpus.document(doc).vector,
                    self.corpus.document(center).vector)
        return cost

    def get_article_content(self, ids):
        """Use the ids to find the content of the articles through google
        web content API."""
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'
        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            contents.append(r.json)
        return contents

    def generate_htmls(self, k, ids):
        """Use the ids and k to generate htmls (title -> article URL map)."""
        htmls = {}
        for i in self.get_article_content(ids):
            feed = i['items'][0]
            for content in [u'content', u'summary']:
                if content in feed:
                    title = feed['title']
                    url = feed['alternate'][0]['href']
                    htmls[title] = url
        return htmls
def import_google_reader_begin(user_id, access_token):
    """Start a Google Reader import for one user (celery task body).

    Records import bookkeeping in a GoogleReaderImport, then dispatches
    sub-tasks: one for the starred list, one per feed up to
    config.GR_MAX_FEEDS.
    """
    auth = OAuth2Method(settings.GOOGLE_OAUTH2_CLIENT_ID,
                        settings.GOOGLE_OAUTH2_CLIENT_SECRET)
    auth.authFromAccessToken(access_token)
    reader = GoogleReader(auth)
    django_user, mongo_user = get_user_from_dbs(user_id)
    username = django_user.username
    try:
        user_infos = reader.getUserInfo()
    # libgreader raises TypeError on auth/transport failures here.
    except TypeError:
        LOGGER.exception(u'Could not start Google Reader import for user %s.',
                         username)
        # Don't refresh, it's now done by a dedicated periodic task.
        # If we failed, it means the problem is quite serious.
        # import_google_reader_trigger(user_id, refresh=True)
        return
    GR_MAX_FEEDS = config.GR_MAX_FEEDS
    LOGGER.info(u'Starting Google Reader import for user %s.', username)
    gri = GoogleReaderImport(user_id)
    # take note of user informations now that we have them.
    gri.start(user_infos=user_infos)
    reader.buildSubscriptionList()
    total_reads, reg_date = reader.totalReadItems(without_date=False)
    total_starred, star1_date = reader.totalStarredItems(without_date=False)
    total_feeds = len(reader.feeds) + 1  # +1 for 'starred'
    gri.reg_date(pytime.mktime(reg_date.timetuple()))
    gri.star1_date(pytime.mktime(star1_date.timetuple()))
    gri.total_reads(total_reads)
    gri.total_starred(total_starred)
    LOGGER.info(
        u'Google Reader import for user %s: %s feed(s) and %s read '
        u'article(s) to go…', username, total_feeds, total_reads)
    if total_feeds > GR_MAX_FEEDS and not settings.DEBUG:
        mail_admins(
            'User {0} has more than {1} feeds: {2}!'.format(
                username, GR_MAX_FEEDS, total_feeds),
            u"\n\nThe GR import will be incomplete.\n\n"
            u"Just for you to know…\n\n")
    # We launch the starred feed import first. Launching it after the
    # standard feeds makes it being delayed until the world's end.
    reader.makeSpecialFeeds()
    starred_feed = reader.getSpecialFeed(ReaderUrl.STARRED_LIST)
    import_google_reader_starred.apply_async((user_id, username, starred_feed),
                                             queue='low')
    # Starts at 1: the starred feed dispatched above already counts.
    processed_feeds = 1
    feeds_to_import = []
    for gr_feed in reader.feeds[:GR_MAX_FEEDS]:
        try:
            feed = create_feed(gr_feed, mongo_user)
        except Feed.DoesNotExist:
            LOGGER.exception(
                u'Could not create feed “%s” for user %s, '
                u'skipped.', gr_feed.title, username)
            continue
        processed_feeds += 1
        feeds_to_import.append((user_id, username, gr_feed, feed))
        LOGGER.info(u'Imported feed “%s” (%s/%s) for user %s…',
                    gr_feed.title, processed_feeds, total_feeds, username)
    # We need to clamp the total, else task won't finish in
    # the case where the user has more feeds than allowed.
    #
    gri.total_feeds(min(processed_feeds, GR_MAX_FEEDS))
    for feed_args in feeds_to_import:
        import_google_reader_articles.apply_async(feed_args, queue='low')
    LOGGER.info(
        u'Imported %s/%s feeds in %s. Articles import already '
        u'started with limits: date: %s, %s waves of %s articles, '
        u'max articles: %s, reads: %s, starred: %s.',
        processed_feeds, total_feeds,
        naturaldelta(now() - gri.start()),
        naturaltime(max([gri.reg_date(), GR_OLDEST_DATE])),
        config.GR_WAVE_LIMIT, config.GR_LOAD_LIMIT,
        config.GR_MAX_ARTICLES, total_reads, total_starred)
def test_reader(self):
    """Constructing a GoogleReader with client auth must yield an object."""
    auth = ClientAuthMethod(username, password)
    self.assertNotEqual(GoogleReader(auth), None)
class DM_GReader():
    """Cluster articles pulled from a Google Reader category.

    Builds a pattern.vector Corpus from one category's articles, clusters
    it (k-means by default, cover tree optionally) and exposes the cluster
    representatives plus helpers to fetch and render them.
    """

    def __init__(self, username, password, method='kmeans'):
        # Authenticate and cache the account's category list.
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        # 'kmeans' or 'covertree'; drives _generate_clusters.
        self.method = method

    def import_category(self, category_id=0, path=None, local=False,
                        max_articles=2000, days=3):
        """Import the specific category to a Pattern Corpus for future
        calculation.

        category_id: the integer indicates which category to use.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one
            day's subscriptions is too much.
        days: how many days back to crawl.
        """
        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return
        if local:
            # Reuse the pickled corpus instead of crawling.
            self.corpus = Corpus.load(path)
            return
        self.target_category = self.categories[category_id]
        continuation = None
        # Crawl only the data within one day
        time_threadshold = calendar.timegm(
            (datetime.date.today() - datetime.timedelta(days=days)).timetuple())
        i = 1
        # ~20 items per query, so cap the number of pages at max_articles / 20.
        while 1 and i < (max_articles / 20):
            self.target_category_content = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']
            # Stop once the page is older than the cut-off timestamp.
            if self.target_category_content['updated'] < time_threadshold:
                break
            feeds_docs = []
            for feed in feeds:
                # Last 16 chars of the reader id name the document.
                doc_name = feed[u'id'][-16:]
                # Prefer full 'content', fall back to 'summary'.
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break
            self.corpus.extend(feeds_docs)
            # Follow the pagination token when the server provides one.
            if u'continuation' in self.target_category_content and \
                    self.target_category_content[u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break
            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1
        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by
        KMPP method.

        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with
        small error.
        maxlevel is the cover-tree depth (covertree method only).
        """
        if self.method == "kmeans":
            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS, k=k,
                                                seed=KMPP, p=p, iterations=10)
            doc_list = []
            # For each cluster, calculate the centroid, and calculate the
            # doc (vector) which is nearest to the centroid.
            for cluster in self.clusters:
                c = centroid(cluster)
                # NOTE(review): d_min starts out as a *tuple* rather than a
                # distance -- likely meant `distance(cluster[0].vector, c)`.
                # Preserved as-is; confirm before relying on nearest-doc
                # selection.
                d_min = (cluster[0].vector, c)
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if distance(doc.vector, c) < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster]
                             for cluster in self.clusters]
        elif self.method == 'covertree':
            def mydistance(doc_name1, doc_name2):
                # Distance between two documents, looked up by name.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)
            self.covertree = Covertree(mydistance, maxlevel)
            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)
            self.covertree.merge_levels()
            self.centroids, self.clusters = \
                self.covertree.clustering_from_ct(k)

    def generate_repr_ids(self, k):
        """For each cluster, we choose an arbitary article as the cluster's
        representative.

        Return the ids of the article, here the document name is the
        article's id. Google Reader is using
        "i=http://www.google.com/reader/api/0/stream/items/contents"
        to get the content of a specific data.
        Now we use the centroid to represent the documents.
        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        """Sum of distances from every clustered document to its centroid."""
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(self.corpus.document(doc).vector,
                                 self.corpus.document(center).vector)
        return cost

    def get_article_content(self, ids):
        """Use the ids to find the content of the articles through google
        web content API."""
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'
        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            contents.append(r.json)
        return contents

    def generate_htmls(self, k, ids):
        """Use the ids and k to generate htmls (title -> article URL map)."""
        htmls = {}
        for i in self.get_article_content(ids):
            feed = i['items'][0]
            for content in [u'content', u'summary']:
                if content in feed:
                    title = feed['title']
                    url = feed['alternate'][0]['href']
                    htmls[title] = url
        return htmls
class HooverReader(object): ''' Export everything that was saved in Google Reader as JSON objects. Keep as much information as possible, but especially ID (useful for cross-referencing), title, url, notes (probably gone) and read status. Each file should contain entries for just one category/tag. List of categories (folders in Google Reader) should be stores in categories.json. Script has no memory and will always fetch everything (doesn't do incremental updates). Script will NOT save list of feeds since those can be exported as OPML from Google Reader. If it hits rate limit, then it will pause up to half an hour before giving up. DILEMMAS: - Should we save feeds contents? How far? - Should we save categories contents? How far? (probably; categories can contain entries labeled with category label that are not otherwise tagged and hence not backed up) Save: - all tagged entries (labeled feeds; categories that don't contain feeds) - list of categories with feeds they contain Algorithm: - fetch a list of categories - fetch a list of all labels (which includes categories) - for every label which is not a category: - loadItems - execute loadMoreItems until items count remains same (or error) - dump data as JSON to file - for every category fetch a list of feeds it contains - dump the list of categories with feeds as JSON to a file ''' def __init__(self, username, password): self.auth = ClientAuthMethod(username, password) self.reader = GoogleReader(self.auth) self.reader.makeSpecialFeeds() # Fetches list of special feeds like starred self.reader.buildSubscriptionList() # Fetches list of feeds AND categories self.categories = self.reader.categories self.feeds = self.reader.feeds[:] # Make a copy so lib calls don't feel it with crap self.specialFeeds = self.reader.specialFeeds.copy() def __create_feed_filename(self, feed_label): return "{0}.json".format(feed_label) def get_tags(self): tags_json = self.reader.httpGet( 'https://www.google.com/reader/api/0/tag/list', 
{'output': 'json'}) tags = json.loads(tags_json) tags_list = tags['tags'] self.tags = tags_list def load_items(self, feed): fetch_size = 1000 tryagain = 0 feed.loadItems(loadLimit=fetch_size) while (feed.lastLoadLength > 0 and feed.lastLoadLength == fetch_size) \ or (tryagain > 0 and tryagain < 5): feed.loadMoreItems(loadLimit=fetch_size) if not feed.lastLoadOk: print "Error fetching items for feed '{0}'".format( feed.title) pause_for = PAUSE_INTERVAL * (2 ** tryagain) print "Pausing for a {0} minute(s)...".format(pause_for / 60) # Double time to sleep on each iteration time.sleep(pause_for) tryagain += 1 else: tryagain = 0 return feed.items def process_item(self, item): values = {} keys = ('id', 'title', 'content', 'read', 'starred', 'shared', 'url') for key in keys: values[key] = getattr(item, key, u'') values['origin'] = getattr(item, 'origin', {}) return values def get_feed_info(self, feed): feed_obj = { 'feed_id': feed.id, 'title': feed.title, 'site_url': getattr(feed, "siteUrl", ""), 'feed_url': getattr(feed, "feedUrl", ""), 'last_updated': feed.lastUpdated, # Unix timestamp; updated when feed is fetched } return feed_obj def save_to_file(self, filename, obj, subdir=None): save_dir = BACKUP_DIR if subdir: save_dir = join(BACKUP_DIR, subdir) if not os.path.exists(save_dir): try: os.makedirs(save_dir) except: # Could not create it print 'Could not create backup directory {0}. Exiting.'.format( save_dir) sys.exit(1) obj_json = toJSON(obj) fname = join(save_dir, filename) with open(fname, 'w') as f: f.write(obj_json) def save_feed(self, feed, subdir=None): items = [] print 'Saving:', feed.title.encode('utf-8') try: raw_items = self.load_items(feed) except: print 'Failed. Moving on...' 
print return for item in raw_items: items.append(self.process_item(item)) feed_obj = self.get_feed_info(feed) feed_obj['items'] = items feed_obj['items_count'] = len(items) self.save_to_file(self.__create_feed_filename(feed.title), feed_obj, subdir) def process_category(self, category): cat = { 'id': category.id, 'title': category.label, } cat['feeds'] = [self.get_feed_info(feed) for feed in category.feeds] return cat def save_tag(self, tag): cat = { 'id': tag.id, 'title': tag.label, } print 'Saving:', tag.label.encode('utf-8') cat['items'] = [self.process_item(item) for item in self.load_items(tag)] cat['items_count'] = len(cat['items']) self.save_to_file(self.__create_feed_filename(cat['title']), cat, 'tags') def save_categories(self): categories = { 'title': 'Google Reader Categories' } categories['categories'] = [self.process_category(cat) for cat in self.categories] if len(categories['categories']): self.save_to_file("categories.json", categories) else: print 'There are no categories to save.' def save_feed_list(self): feeds = { 'title': 'Google Reader List of Feeds' } feeds_list = [] for feed in self.feeds: feeds_list.append(self.get_feed_info(feed)) feeds['feeds'] = feeds_list if len(feeds['feeds']): self.save_to_file("feeds.json", feeds) else: print 'There are no feeds to save.' def backup(self): if getattr(settings, 'SAVE_TAGS', True): print "Saving tags..." self.get_tags() for tag in self.tags: # Tag is really a category try: label = tag['id'].rsplit('label/')[1] except: # Special feeds (state/); skip, they are handled separately continue ctag = Category(self.reader, label, tag['id']) self.save_tag(ctag) if getattr(settings, 'SAVE_FEEDS', False): print "Saving feeds..." for feed in self.feeds: self.save_feed(feed, 'feeds') print "Saving special feeds..." 
if getattr(settings, 'SAVE_SPECIAL_FEEDS_ALL', False): sf_keys = self.specialFeeds.keys() else: sf_keys = ('starred', ) for feed_name in sf_keys: feed = self.specialFeeds[feed_name] self.save_feed(feed, 'special') if getattr(settings, 'SAVE_CATEGORIES', True): print "Saving list of feeds and categories..." self.save_feed_list() self.save_categories()
from libgreader import GoogleReader, ClientAuthMethod username = "******" password = "******" ca = ClientAuthMethod(username, password) reader = GoogleReader(ca) info = reader.getUserInfo() print info x = reader.buildSubscriptionList() if x == "True": print ("Subscription list built.") else: print ("Something went wrong with building the subscription list.") print ("Printing feed objects...") print reader.feeds # reader.feeds is a list, so we can play around wtih individual objects print reader.feeds[0] # Each list item has some properties associated with it print reader.feeds[0].id print reader.feeds[0].title # List comprehension for returning feed id print [x.id for x in reader.feeds]