# Imports assumed by this section: standard-library modules plus the
# project's own "base" package. The worker classes (FetchItemRefsWorker,
# FetchWriteItemBodiesWorker, FetchCommentsWorker) and the remaining helpers
# (_get_stream_ids, _load_additional_item_refs) are defined elsewhere in the
# module.
import argparse
import itertools
import json
import logging
import os.path

import base.api
import base.atom
import base.log
import base.paths
import base.url_fetcher
import base.worker


def _save_preferences(api, data_directory):
  def save(preferences_json, file_name):
    file_path = os.path.join(data_directory, file_name)
    with open(file_path, 'w') as file:
      file.write(json.dumps(preferences_json))

  save(api.fetch_preferences(), 'preferences.json')
  save(api.fetch_stream_preferences(), 'stream-preferences.json')
  save(
      [g.to_json() for g in api.fetch_sharing_groups()],
      'sharing-groups.json')
  save(api.fetch_sharing_acl().to_json(), 'sharing-acl.json')
  save(api.fetch_user_info().to_json(), 'user-info.json')
def main():
  base.log.init()
  base.atom.init()

  parser = argparse.ArgumentParser(
      description='Comprehensive archive of a Google Reader account')

  # Credentials
  parser.add_argument('--use_client_login', action='store_true',
                      help='Instead of OAuth, use ClientLogin for '
                           'authentication. You will be prompted for a '
                           'username and password')
  parser.add_argument('--oauth_refresh_token', default='',
                      help='A previously obtained refresh token (used to '
                           'bypass OAuth setup)')
  parser.add_argument('--account', default='',
                      help='Google Account to save the archive for. Omit to '
                           'specify via standard input')
  parser.add_argument('--password', default='',
                      help='Password for the account. Omit to specify via '
                           'standard input')

  # Output options
  parser.add_argument('--output_directory', default='./',
                      help='Directory in which to place archive data.')

  # Fetching options
  parser.add_argument('--stream_items_chunk_size', type=int, default=10000,
                      help='Number of item refs to request per stream items '
                           'API call (higher is more efficient)')
  parser.add_argument('--max_items_per_stream', type=int, default=0,
                      help='If non-zero, will cap the number of items that '
                           'are fetched per feed or tag')
  parser.add_argument('--item_bodies_chunk_size', type=int, default=250,
                      help='Number of item refs per request for fetching '
                           'their bodies (higher is more efficient)')
  parser.add_argument('--comments_chunk_size', type=int, default=250,
                      help='Number of items per request for fetching comments '
                           'on shared items (higher is more efficient)')
  parser.add_argument('--max_streams', type=int, default=0,
                      help='Maximum number of streams to archive (0 for no '
                           'limit; only meant to be used for development)')
  parser.add_argument('--parallelism', type=int, default=10,
                      help='Number of requests to make in parallel.')
  parser.add_argument('--http_retry_count', type=int, default=1,
                      help='Number of retries to make in the case of HTTP '
                           'request errors.')

  # Miscellaneous.
  parser.add_argument('--additional_item_refs_file_path', default='',
                      help='Path to JSON file listing additional tag item '
                           'refs to fetch')

  args = parser.parse_args()

  output_directory = base.paths.normalize(args.output_directory)
  base.paths.ensure_exists(output_directory)

  def output_sub_directory(name):
    directory_path = os.path.join(output_directory, name)
    base.paths.ensure_exists(directory_path)
    return directory_path

  api_responses_directory = output_sub_directory('_raw_data')
  streams_directory = output_sub_directory('streams')
  data_directory = output_sub_directory('data')
  items_directory = output_sub_directory('items')
  comments_directory = output_sub_directory('comments')

  if args.use_client_login:
    authenticated_url_fetcher = base.url_fetcher.ClientLoginUrlFetcher(
        args.account, args.password)
  else:
    authenticated_url_fetcher = base.url_fetcher.OAuthUrlFetcher(
        args.oauth_refresh_token)
  api = base.api.Api(
      authenticated_url_fetcher=authenticated_url_fetcher,
      http_retry_count=args.http_retry_count,
      cache_directory=api_responses_directory)

  user_info = api.fetch_user_info()
  logging.info(
      'Created API instance for %s (%s)', user_info.user_id, user_info.email)

  logging.info('Saving preferences')
  _save_preferences(api, data_directory)

  logging.info('Gathering streams to fetch')
  stream_ids = _get_stream_ids(api, user_info.user_id, data_directory)
  if args.max_streams and len(stream_ids) > args.max_streams:
    stream_ids = stream_ids[:args.max_streams]
  logging.info('%d streams to fetch, gathering item refs:', len(stream_ids))

  item_ids, item_refs_total = _fetch_and_save_item_refs(
      stream_ids, api, args, streams_directory, user_info.user_id)
  logging.info('%s unique item refs (%s total), getting item bodies:',
               '{:,}'.format(len(item_ids)),
               '{:,}'.format(item_refs_total))

  item_ids_chunks = _chunk_item_ids(item_ids, args.item_bodies_chunk_size)

  item_bodies_to_fetch = len(item_ids)
  fetched_item_bodies = [0]
  missing_item_bodies = set()

  def report_item_bodies_progress(requested_item_ids, found_item_ids):
    if found_item_ids is None:
      missing_item_bodies.update(
          set(requested_item_ids).difference(
              base.api.not_found_items_ids_to_ignore))
      return
    fetched_item_bodies[0] += len(found_item_ids)
    missing_item_bodies.update(
        set(requested_item_ids).difference(set(found_item_ids)).difference(
            base.api.not_found_items_ids_to_ignore))
    logging.info('  Fetched %s/%s item bodies (%s could not be loaded)',
                 '{:,}'.format(fetched_item_bodies[0]),
                 '{:,}'.format(item_bodies_to_fetch),
                 '{:,}'.format(len(missing_item_bodies)))

  base.worker.do_work(
      lambda: FetchWriteItemBodiesWorker(api, items_directory),
      item_ids_chunks,
      args.parallelism,
      report_progress=report_item_bodies_progress)

  if missing_item_bodies:
    logging.warn(
        'Item bodies could not be loaded for: %s',
        ', '.join([i.compact_form() for i in missing_item_bodies]))

  broadcast_stream_ids = [
      stream_id for stream_id in stream_ids
      if stream_id.startswith('user/') and
          stream_id.endswith('/state/com.google/broadcast')
  ]
  logging.info('Fetching comments from %d shared item streams.',
               len(broadcast_stream_ids))
  encoded_sharers = api.fetch_encoded_sharers()
  remaining_broadcast_stream_ids = [len(broadcast_stream_ids)]

  def report_comments_progress(_, comments_by_item_id):
    if comments_by_item_id is None:
      return
    remaining_broadcast_stream_ids[0] -= 1
    comment_count = sum((len(c) for c in comments_by_item_id.values()), 0)
    logging.info('  Fetched %s comments, %s shared item streams left.',
                 '{:,}'.format(comment_count),
                 '{:,}'.format(remaining_broadcast_stream_ids[0]))

  all_comments = {}
  comments_for_broadcast_streams = base.worker.do_work(
      lambda: FetchCommentsWorker(
          api, encoded_sharers, args.comments_chunk_size),
      broadcast_stream_ids,
      args.parallelism,
      report_progress=report_comments_progress)

  total_comment_count = 0
  for comments_for_broadcast_stream in comments_for_broadcast_streams:
    if not comments_for_broadcast_stream:
      continue
    for item_id, comments in comments_for_broadcast_stream.iteritems():
      total_comment_count += len(comments)
      all_comments.setdefault(item_id, []).extend(comments)
  logging.info('Writing %s comments from %s items.',
               '{:,}'.format(total_comment_count),
               '{:,}'.format(len(all_comments)))
  for item_id, comments in all_comments.items():
    item_comments_file_path = os.path.join(
        base.paths.item_id_to_file_path(comments_directory, item_id),
        item_id.compact_form())
    base.paths.ensure_exists(os.path.dirname(item_comments_file_path))
    with open(item_comments_file_path, 'w') as item_comments_file:
      item_comments_file.write(json.dumps([c.to_json() for c in comments]))

  with open(os.path.join(output_directory, 'README'), 'w') as readme_file:
    readme_file.write('See https://github.com/mihaip/readerisdead/'
                      'wiki/reader_archive-Format.\n')
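
# The two helpers below are not shown in this section as standalone
# definitions; what follows is a minimal sketch reconstructed from their call
# sites in main() and from the inline logic of earlier revisions of this
# script. Treat the exact signatures and bodies as assumptions rather than the
# canonical implementations.

# _fetch_and_save_item_refs fetches item refs for every stream (in parallel),
# writes one JSON stream file per stream ID into streams_directory, and
# returns the de-duplicated item IDs plus the total item ref count.
def _fetch_and_save_item_refs(
    stream_ids, api, args, streams_directory, user_id):
  fetched_stream_ids = [0]

  def report_item_refs_progress(stream_id, item_refs):
    if item_refs is None:
      logging.error('  Could not load item refs from %s', stream_id)
      return
    fetched_stream_ids[0] += 1
    logging.info('  Loaded %s item refs from %s, %d streams left.',
                 '{:,}'.format(len(item_refs)),
                 stream_id,
                 len(stream_ids) - fetched_stream_ids[0])

  item_refs_responses = base.worker.do_work(
      lambda: FetchItemRefsWorker(
          api, args.stream_items_chunk_size, args.max_items_per_stream),
      stream_ids,
      args.parallelism,
      report_progress=report_item_refs_progress)

  if args.additional_item_refs_file_path:
    _load_additional_item_refs(
        base.paths.normalize(args.additional_item_refs_file_path),
        stream_ids,
        item_refs_responses,
        user_id)

  item_ids = set()
  item_refs_total = 0
  for stream_id, item_refs in itertools.izip(stream_ids, item_refs_responses):
    if not item_refs:
      continue
    item_ids.update([item_ref.item_id for item_ref in item_refs])
    item_refs_total += len(item_refs)

    # Explore-stream items are expected to sometimes be unfetchable, so they
    # are excluded from missing-body reporting later on.
    if stream_id == base.api.EXPLORE_STREAM_ID:
      base.api.not_found_items_ids_to_ignore.update(
          [i.item_id for i in item_refs])

    stream = base.api.Stream(stream_id=stream_id, item_refs=item_refs)
    stream_file_name = base.paths.stream_id_to_file_name(stream_id) + '.json'
    stream_file_path = os.path.join(streams_directory, stream_file_name)
    with open(stream_file_path, 'w') as stream_file:
      stream_file.write(json.dumps(stream.to_json()))

  return list(item_ids), item_refs_total


# _chunk_item_ids balances two chunking goals:
# - Fetch items in large-ish chunks (ideally 250), to minimize HTTP request
#   overhead per item.
# - Write items in small-ish chunks per file, since a file per item is too
#   annoying to deal with from a file-system perspective.
# The chunking into files also needs to be deterministic, so that from an item
# ID we know which file to look for it in. The IDs are therefore first grouped
# by their (deterministic) file path, and those per-file groups are then
# packed into the chunks that get fetched.
def _chunk_item_ids(item_ids, chunk_size):
  # Assumption: base.paths.item_id_to_file_path only needs the item ID to
  # produce a deterministic grouping key, so an empty directory prefix is
  # passed here.
  item_ids_by_path = {}
  for item_id in item_ids:
    item_id_file_path = base.paths.item_id_to_file_path('', item_id)
    item_ids_by_path.setdefault(item_id_file_path, []).append(item_id)

  current_chunk = []
  chunks = [current_chunk]
  for item_ids_for_file_path in item_ids_by_path.values():
    if len(current_chunk) + len(item_ids_for_file_path) > chunk_size:
      current_chunk = []
      chunks.append(current_chunk)
    current_chunk.extend(item_ids_for_file_path)
  return chunks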
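
# Usage sketch: the script is normally run directly as a command-line tool.
# The file name and flag values below are illustrative assumptions; all flags
# are defined in main() above.
#
#   python reader_archive.py --output_directory=/path/to/reader-archive \
#       --parallelism=10 --http_retry_count=2
#
if __name__ == '__main__':
  main()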