Example 1
def main():
    base.log.init()
    base.atom.init()

    parser = argparse.ArgumentParser(description="Comprehensive archive of a Google Reader account")

    # Credentials
    parser.add_argument(
        "--use_client_login",
        action="store_true",
        help="Instead of OAuth, use ClientLogin for "
        "authentication. You will be prompted for a "
        "username and password",
    )
    parser.add_argument(
        "--oauth_refresh_token", default="", help="A previously obtained refresh token (used to bypass " "OAuth setup"
    )
    parser.add_argument(
        "--account", default="", help="Google Account to save the archive for. Omit to " "specify via standard input"
    )
    parser.add_argument(
        "--password", default="", help="Password for the account. Omit to specify via " "standard input"
    )

    # Output options
    parser.add_argument("--output_directory", default="./", help="Directory where to place archive data.")

    # Fetching options
    parser.add_argument(
        "--stream_items_chunk_size",
        type=int,
        default=10000,
        help="Number of items refs to request per stream items " "API call (higher is more efficient)",
    )
    parser.add_argument(
        "--max_items_per_stream",
        type=int,
        default=0,
        help="If non-zero, will cap the number of items that are " "fetched per feed or tag",
    )
    parser.add_argument(
        "--item_bodies_chunk_size",
        type=int,
        default=250,
        help="Number of items refs per request for fetching their " "bodies (higher is more efficient)",
    )
    parser.add_argument(
        "--comments_chunk_size",
        type=int,
        default=250,
        help="Number of items per request for fetching comments " "on shared items (higher is more efficient)",
    )
    parser.add_argument(
        "--max_streams",
        type=int,
        default=0,
        help="Maxmium number of streams to archive (0 for no" "limit, only mean to be used for development)",
    )
    parser.add_argument("--parallelism", type=int, default=10, help="Number of requests to make in parallel.")
    parser.add_argument(
        "--http_retry_count",
        type=int,
        default=1,
        help="Number of retries to make in the case of HTTP " "request errors.",
    )

    # Miscellaneous.
    parser.add_argument(
        "--additional_item_refs_file_path",
        default="",
        help="Path to JSON file listing additional tag item refs " "to fetch",
    )

    args = parser.parse_args()

    output_directory = base.paths.normalize(args.output_directory)
    base.paths.ensure_exists(output_directory)

    def output_sub_directory(name):
        directory_path = os.path.join(output_directory, name)
        base.paths.ensure_exists(directory_path)
        return directory_path

    api_responses_directory = output_sub_directory("_raw_data")
    streams_directory = output_sub_directory("streams")
    data_directory = output_sub_directory("data")
    items_directory = output_sub_directory("items")
    comments_directory = output_sub_directory("comments")

    if args.use_client_login:
        authenticated_url_fetcher = base.url_fetcher.ClientLoginUrlFetcher(args.account, args.password)
    else:
        authenticated_url_fetcher = base.url_fetcher.OAuthUrlFetcher(args.oauth_refresh_token)
    api = base.api.Api(
        authenticated_url_fetcher=authenticated_url_fetcher,
        http_retry_count=args.http_retry_count,
        cache_directory=api_responses_directory,
    )

    user_info = api.fetch_user_info()
    logging.info("Created API instance for %s (%s)", user_info.user_id, user_info.email)

    logging.info("Saving preferences")
    _save_preferences(api, data_directory)

    logging.info("Gathering streams to fetch")
    stream_ids = _get_stream_ids(api, user_info.user_id, data_directory)
    if args.max_streams and len(stream_ids) > args.max_streams:
        stream_ids = stream_ids[: args.max_streams]
    logging.info("%d streams to fetch, gathering item refs:", len(stream_ids))

    fetched_stream_ids = [0]

    def report_item_refs_progress(stream_id, item_refs):
        if item_refs is None:
            logging.error("  Could not load item refs from %s", stream_id)
            return
        fetched_stream_ids[0] += 1
        logging.info(
            "  Loaded %s item refs from %s, %d streams left.",
            "{:,}".format(len(item_refs)),
            stream_id,
            len(stream_ids) - fetched_stream_ids[0],
        )

    item_refs_responses = base.worker.do_work(
        lambda: FetchItemRefsWorker(api, args.stream_items_chunk_size, args.max_items_per_stream),
        stream_ids,
        args.parallelism,
        report_progress=report_item_refs_progress,
    )

    if args.additional_item_refs_file_path:
        _load_additional_item_refs(
            base.paths.normalize(args.additional_item_refs_file_path),
            stream_ids,
            item_refs_responses,
            user_info.user_id,
        )

    item_ids = set()
    known_item_ids_in_compact_form = set()
    item_refs_total = 0
    for stream_id, item_refs in itertools.izip(stream_ids, item_refs_responses):
        if not item_refs:
            continue
        item_ids.update([item_ref.item_id for item_ref in item_refs])
        item_refs_total += len(item_refs)

        if stream_id == base.api.EXPLORE_STREAM_ID:
            base.api.not_found_items_ids_to_ignore.update([i.item_id for i in item_refs])

        stream = base.api.Stream(stream_id=stream_id, item_refs=item_refs)
        stream_file_name = base.paths.stream_id_to_file_name(stream_id) + ".json"
        stream_file_path = os.path.join(streams_directory, stream_file_name)
        with open(stream_file_path, "w") as stream_file:
            stream_file.write(json.dumps(stream.to_json()))

    item_ids = list(item_ids)
    logging.info(
        "%s unique items refs (%s total), getting item bodies:",
        "{:,}".format(len(item_ids)),
        "{:,}".format(item_refs_total),
    )

    # We have two different chunking goals:
    # - Fetch items in large-ish chunks (ideally 250), to minimize HTTP request
    #   overhead per item
    # - Write items in small-ish chunks (ideally around 10) per file, since having
    #   a file per item is too annoying to deal with from a file-system
    #   perspective. We also need the chunking into files to be deterministic, so
    #   that from an item ID we know what file to look for it in.
    # We therefore first chunk the IDs by file path, and then group those chunks
    # into ID chunks that we fetch.
    # We write the file chunks immediately after fetching to decrease the
    # in-memory working set of the script.
    item_ids_by_path = {}
    for item_id in item_ids:
        item_id_file_path = base.paths.item_id_to_file_path(items_directory, item_id)
        item_ids_by_path.setdefault(item_id_file_path, []).append(item_id)

    current_item_ids_chunk = []
    item_ids_chunks = [current_item_ids_chunk]
    for item_ids_for_file_path in item_ids_by_path.values():
        if len(current_item_ids_chunk) + len(item_ids_for_file_path) > args.item_bodies_chunk_size:
            current_item_ids_chunk = []
            item_ids_chunks.append(current_item_ids_chunk)
        current_item_ids_chunk.extend(item_ids_for_file_path)

    item_bodies_to_fetch = len(item_ids)
    fetched_item_bodies = [0]

    def report_item_bodies_progress(_, count):
        if count is None:
            return
        fetched_item_bodies[0] += count
        logging.info(
            "  Fetched %s/%s item bodies", "{:,}".format(fetched_item_bodies[0]), "{:,}".format(item_bodies_to_fetch)
        )

    base.worker.do_work(
        lambda: FetchWriteItemBodiesWorker(api, items_directory),
        item_ids_chunks,
        args.parallelism,
        report_progress=report_item_bodies_progress,
    )

    broadcast_stream_ids = [
        stream_id
        for stream_id in stream_ids
        if stream_id.startswith("user/") and stream_id.endswith("/state/com.google/broadcast")
    ]
    logging.info("Fetching comments from %d shared item streams.", len(broadcast_stream_ids))
    encoded_sharers = api.fetch_encoded_sharers()
    remaining_broadcast_stream_ids = [len(broadcast_stream_ids)]

    def report_comments_progress(_, comments_by_item_id):
        if comments_by_item_id is None:
            return
        remaining_broadcast_stream_ids[0] -= 1
        comment_count = sum((len(c) for c in comments_by_item_id.values()), 0)
        logging.info(
            "  Fetched %s comments, %s shared items streams left.",
            "{:,}".format(comment_count),
            "{:,}".format(remaining_broadcast_stream_ids[0]),
        )

    all_comments = {}
    comments_for_broadcast_streams = base.worker.do_work(
        lambda: FetchCommentsWorker(api, encoded_sharers, args.comments_chunk_size),
        broadcast_stream_ids,
        args.parallelism,
        report_progress=report_comments_progress,
    )
    total_comment_count = 0
    for comments_for_broadcast_stream in comments_for_broadcast_streams:
        if not comments_for_broadcast_stream:
            continue
        for item_id, comments in comments_for_broadcast_stream.iteritems():
            total_comment_count += len(comments)
            all_comments.setdefault(item_id, []).extend(comments)

    logging.info(
        "Writing %s comments from %s items.", "{:,}".format(total_comment_count), "{:,}".format(len(all_comments))
    )
    for item_id, comments in all_comments.items():
        item_comments_file_path = os.path.join(
            base.paths.item_id_to_file_path(comments_directory, item_id), item_id.compact_form()
        )
        base.paths.ensure_exists(os.path.dirname(item_comments_file_path))
        with open(item_comments_file_path, "w") as item_comments_file:
            item_comments_file.write(json.dumps([c.to_json() for c in comments]))

    with open(os.path.join(output_directory, "README"), "w") as readme_file:
        readme_file.write("See https://github.com/mihaip/readerisdead/" "wiki/reader_archive-Format.\n")
Example 2
def main():
  base.log.init()
  base.atom.init()

  parser = argparse.ArgumentParser(
      description='Comprehensive archive of a Google Reader account')

  # Credentials
  parser.add_argument('--use_client_login', action='store_true',
                      help='Instead of OAuth, use ClientLogin for '
                            'authentication. You will be prompted for a '
                            'username and password')
  parser.add_argument('--oauth_refresh_token', default='',
                      help='A previously obtained refresh token (used to bypass '
                            'OAuth setup)')
  parser.add_argument('--account', default='',
                      help='Google Account to save the archive for. Omit to '
                          'specify via standard input')
  parser.add_argument('--password', default='',
                      help='Password for the account. Omit to specify via '
                          'standard input')

  # Output options
  parser.add_argument('--output_directory', default='./',
                      help='Directory where to place archive data.')

  # Fetching options
  parser.add_argument('--stream_items_chunk_size', type=int, default=10000,
                      help='Number of item refs to request per stream items '
                           'API call (higher is more efficient)')
  parser.add_argument('--max_items_per_stream', type=int, default=0,
                      help='If non-zero, will cap the number of items that are '
                            'fetched per feed or tag')
  parser.add_argument('--item_bodies_chunk_size', type=int, default=250,
                      help='Number of item refs per request for fetching their '
                           'bodies (higher is more efficient)')
  parser.add_argument('--comments_chunk_size', type=int, default=250,
                      help='Number of items per request for fetching comments '
                           'on shared items (higher is more efficient)')
  parser.add_argument('--max_streams', type=int, default=0,
                      help='Maximum number of streams to archive (0 for no '
                           'limit, only meant to be used for development)')
  parser.add_argument('--parallelism', type=int, default=10,
                      help='Number of requests to make in parallel.')
  parser.add_argument('--http_retry_count', type=int, default=1,
                      help='Number of retries to make in the case of HTTP '
                           'request errors.')

  # Miscellaneous.
  parser.add_argument('--additional_item_refs_file_path', default='',
                      help='Path to JSON file listing additional tag item refs '
                           'to fetch')

  args = parser.parse_args()

  output_directory = base.paths.normalize(args.output_directory)
  base.paths.ensure_exists(output_directory)
  def output_sub_directory(name):
    directory_path = os.path.join(output_directory, name)
    base.paths.ensure_exists(directory_path)
    return directory_path
  api_responses_directory = output_sub_directory('_raw_data')
  streams_directory = output_sub_directory('streams')
  data_directory = output_sub_directory('data')
  items_directory = output_sub_directory('items')
  comments_directory = output_sub_directory('comments')

  if args.use_client_login:
    authenticated_url_fetcher = base.url_fetcher.ClientLoginUrlFetcher(
        args.account, args.password)
  else:
    authenticated_url_fetcher = base.url_fetcher.OAuthUrlFetcher(
        args.oauth_refresh_token)
  api = base.api.Api(
      authenticated_url_fetcher=authenticated_url_fetcher,
      http_retry_count=args.http_retry_count,
      cache_directory=api_responses_directory)

  user_info = api.fetch_user_info()
  logging.info(
    'Created API instance for %s (%s)', user_info.user_id, user_info.email)

  logging.info('Saving preferences')
  _save_preferences(api, data_directory)

  logging.info('Gathering streams to fetch')
  stream_ids = _get_stream_ids(api, user_info.user_id, data_directory)
  if args.max_streams and len(stream_ids) > args.max_streams:
    stream_ids = stream_ids[:args.max_streams]
  logging.info('%d streams to fetch, gathering item refs:', len(stream_ids))

  item_ids, item_refs_total = _fetch_and_save_item_refs(
      stream_ids, api, args, streams_directory, user_info.user_id)
  logging.info('%s unique item refs (%s total), getting item bodies:',
      '{:,}'.format(len(item_ids)),
      '{:,}'.format(item_refs_total))

  item_ids_chunks = _chunk_item_ids(item_ids, args.item_bodies_chunk_size)

  item_bodies_to_fetch = len(item_ids)
  fetched_item_bodies = [0]
  missing_item_bodies = set()
  def report_item_bodies_progress(requested_item_ids, found_item_ids):
    if found_item_ids is None:
      missing_item_bodies.update(set(requested_item_ids).difference(
          base.api.not_found_items_ids_to_ignore))
      return
    fetched_item_bodies[0] += len(found_item_ids)
    missing_item_bodies.update(
        set(requested_item_ids).difference(set(found_item_ids)).difference(
            base.api.not_found_items_ids_to_ignore))
    logging.info('  Fetched %s/%s item bodies (%s could not be loaded)',
        '{:,}'.format(fetched_item_bodies[0]),
        '{:,}'.format(item_bodies_to_fetch),
        '{:,}'.format(len(missing_item_bodies)))
  base.worker.do_work(
      lambda: FetchWriteItemBodiesWorker(api, items_directory),
      item_ids_chunks,
      args.parallelism,
      report_progress=report_item_bodies_progress)

  if missing_item_bodies:
    logging.warn('Item bodies could not be loaded for: %s',
        ', '.join([i.compact_form() for i in missing_item_bodies]))

  broadcast_stream_ids = [
      stream_id for stream_id in stream_ids
      if stream_id.startswith('user/') and
          stream_id.endswith('/state/com.google/broadcast')
  ]
  logging.info(
      'Fetching comments from %d shared item streams.',
      len(broadcast_stream_ids))
  encoded_sharers = api.fetch_encoded_sharers()
  remaining_broadcast_stream_ids = [len(broadcast_stream_ids)]
  def report_comments_progress(_, comments_by_item_id):
    if comments_by_item_id is None:
      return
    remaining_broadcast_stream_ids[0] -= 1
    comment_count = sum((len(c) for c in comments_by_item_id.values()), 0)
    logging.info('  Fetched %s comments, %s shared item streams left.',
        '{:,}'.format(comment_count),
        '{:,}'.format(remaining_broadcast_stream_ids[0]))
  all_comments = {}
  comments_for_broadcast_streams = base.worker.do_work(
      lambda: FetchCommentsWorker(
          api, encoded_sharers, args.comments_chunk_size),
      broadcast_stream_ids,
      args.parallelism,
      report_progress=report_comments_progress)
  total_comment_count = 0
  for comments_for_broadcast_stream in comments_for_broadcast_streams:
    if not comments_for_broadcast_stream:
      continue
    for item_id, comments in comments_for_broadcast_stream.iteritems():
      total_comment_count += len(comments)
      all_comments.setdefault(item_id, []).extend(comments)

  logging.info('Writing %s comments from %s items.',
      '{:,}'.format(total_comment_count),
      '{:,}'.format(len(all_comments)))
  for item_id, comments in all_comments.items():
    item_comments_file_path = os.path.join(base.paths.item_id_to_file_path(
        comments_directory, item_id), item_id.compact_form())
    base.paths.ensure_exists(os.path.dirname(item_comments_file_path))
    with open(item_comments_file_path, 'w') as item_comments_file:
      item_comments_file.write(json.dumps([c.to_json() for c in comments]))

  with open(os.path.join(output_directory, 'README'), 'w') as readme_file:
    readme_file.write('See https://github.com/mihaip/readerisdead/'
        'wiki/reader_archive-Format.\n')
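
Examples 2 and 4 call _chunk_item_ids, which is not shown on this page. The inline chunking code in Examples 1 and 3 performs the same grouping, so the helper can be approximated as below; this is a minimal sketch reconstructed from those examples, not necessarily the project's exact definition (in particular, passing '' to base.paths.item_id_to_file_path is an assumption: only the grouping of IDs into per-file buckets matters here). The chunks deliberately keep all IDs destined for the same file together, so each fetched chunk can be written out immediately.

def _chunk_item_ids(item_ids, chunk_size):
  # Group item IDs by the file they will be written to, so that the mapping
  # from item ID to file stays deterministic.
  item_ids_by_path = {}
  for item_id in item_ids:
    # The directory prefix does not affect the grouping; '' is a placeholder.
    item_id_file_path = base.paths.item_id_to_file_path('', item_id)
    item_ids_by_path.setdefault(item_id_file_path, []).append(item_id)

  # Pack the per-file groups into fetch chunks of at most chunk_size IDs.
  current_item_ids_chunk = []
  item_ids_chunks = [current_item_ids_chunk]
  for item_ids_for_file_path in item_ids_by_path.values():
    if (len(current_item_ids_chunk) + len(item_ids_for_file_path) >
        chunk_size):
      current_item_ids_chunk = []
      item_ids_chunks.append(current_item_ids_chunk)
    current_item_ids_chunk.extend(item_ids_for_file_path)
  return item_ids_chunks
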
Example 3
def main():
  base.log.init()
  base.atom.init()

  parser = argparse.ArgumentParser(
      description='Comprehensive archive of a Google Reader account')

  # Credentials
  parser.add_argument('--use_client_login', action='store_true',
                      help='Instead of OAuth, use ClientLogin for '
                            'authentication. You will be prompted for a '
                            'username and password')
  parser.add_argument('--oauth_refresh_token', default='',
                      help='A previously obtained refresh token (used to bypass '
                            'OAuth setup)')
  parser.add_argument('--account', default='',
                      help='Google Account to save the archive for. Omit to '
                          'specify via standard input')
  parser.add_argument('--password', default='',
                      help='Password for the account. Omit to specify via '
                          'standard input')

  # Output options
  parser.add_argument('--output_directory', default='./',
                      help='Directory where to place archive data.')

  # Fetching options
  parser.add_argument('--stream_items_chunk_size', type=int, default=10000,
                      help='Number of item refs to request per stream items '
                           'API call (higher is more efficient)')
  parser.add_argument('--max_items_per_stream', type=int, default=0,
                      help='If non-zero, will cap the number of items that are '
                            'fetched per feed or tag')
  parser.add_argument('--item_bodies_chunk_size', type=int, default=250,
                      help='Number of item refs per request for fetching their '
                           'bodies (higher is more efficient)')
  parser.add_argument('--comments_chunk_size', type=int, default=250,
                      help='Number of items per request for fetching comments '
                           'on shared items (higher is more efficient)')
  parser.add_argument('--max_streams', type=int, default=0,
                      help='Maximum number of streams to archive (0 for no '
                           'limit, only meant to be used for development)')
  parser.add_argument('--parallelism', type=int, default=10,
                      help='Number of requests to make in parallel.')
  parser.add_argument('--http_retry_count', type=int, default=1,
                      help='Number of retries to make in the case of HTTP '
                           'request errors.')

  # Miscellaneous.
  parser.add_argument('--additional_item_refs_file_path', default='',
                      help='Path to JSON file listing additional tag item refs '
                           'to fetch')

  args = parser.parse_args()

  output_directory = base.paths.normalize(args.output_directory)
  base.paths.ensure_exists(output_directory)
  def output_sub_directory(name):
    directory_path = os.path.join(output_directory, name)
    base.paths.ensure_exists(directory_path)
    return directory_path
  api_responses_directory = output_sub_directory('_raw_data')
  streams_directory = output_sub_directory('streams')
  data_directory = output_sub_directory('data')
  items_directory = output_sub_directory('items')
  comments_directory = output_sub_directory('comments')

  if args.use_client_login:
    authenticated_url_fetcher = base.url_fetcher.ClientLoginUrlFetcher(
        args.account, args.password)
  else:
    authenticated_url_fetcher = base.url_fetcher.OAuthUrlFetcher(
        args.oauth_refresh_token)
  api = base.api.Api(
      authenticated_url_fetcher=authenticated_url_fetcher,
      http_retry_count=args.http_retry_count,
      cache_directory=api_responses_directory)

  user_info = api.fetch_user_info()
  logging.info(
    'Created API instance for %s (%s)', user_info.user_id, user_info.email)

  logging.info('Saving preferences')
  _save_preferences(api, data_directory)

  logging.info('Gathering streams to fetch')
  stream_ids = _get_stream_ids(api, user_info.user_id, data_directory)
  if args.max_streams and len(stream_ids) > args.max_streams:
    stream_ids = stream_ids[:args.max_streams]
  logging.info('%d streams to fetch, gathering item refs:', len(stream_ids))

  fetched_stream_ids = [0]
  def report_item_refs_progress(stream_id, item_refs):
    if item_refs is None:
      logging.error('  Could not load item refs from %s', stream_id)
      return
    fetched_stream_ids[0] += 1
    logging.info('  Loaded %s item refs from %s, %d streams left.',
        '{:,}'.format(len(item_refs)),
        stream_id,
        len(stream_ids) - fetched_stream_ids[0])
  item_refs_responses = base.worker.do_work(
      lambda: FetchItemRefsWorker(
          api, args.stream_items_chunk_size, args.max_items_per_stream),
      stream_ids,
      args.parallelism,
      report_progress=report_item_refs_progress)

  if args.additional_item_refs_file_path:
    _load_additional_item_refs(
        base.paths.normalize(args.additional_item_refs_file_path),
        stream_ids,
        item_refs_responses,
        user_info.user_id)

  item_ids = set()
  known_item_ids_in_compact_form = set()
  item_refs_total = 0
  for stream_id, item_refs in itertools.izip(stream_ids, item_refs_responses):
    if not item_refs:
      continue
    item_ids.update([item_ref.item_id for item_ref in item_refs])
    item_refs_total += len(item_refs)

    if stream_id == base.api.EXPLORE_STREAM_ID:
      base.api.not_found_items_ids_to_ignore.update(
          [i.item_id for i in item_refs])

    stream = base.api.Stream(stream_id=stream_id, item_refs=item_refs)
    stream_file_name = base.paths.stream_id_to_file_name(stream_id) + '.json'
    stream_file_path = os.path.join(streams_directory, stream_file_name)
    with open(stream_file_path, 'w') as stream_file:
      stream_file.write(json.dumps(stream.to_json()))

  item_ids = list(item_ids)
  logging.info('%s unique item refs (%s total), getting item bodies:',
      '{:,}'.format(len(item_ids)),
      '{:,}'.format(item_refs_total))

  # We have two different chunking goals:
  # - Fetch items in large-ish chunks (ideally 250), to minimize HTTP request
  #   overhead per item
  # - Write items in small-ish chunks (ideally around 10) per file, since having
  #   a file per item is too annoying to deal with from a file-system
  #   perspective. We also need the chunking into files to be deterministic, so
  #   that from an item ID we know what file to look for it in.
  # We therefore first chunk the IDs by file path, and then group those chunks
  # into ID chunks that we fetch.
  # We write the file chunks immediately after fetching to decrease the
  # in-memory working set of the script.
  item_ids_by_path = {}
  for item_id in item_ids:
    item_id_file_path = base.paths.item_id_to_file_path(
        items_directory, item_id)
    item_ids_by_path.setdefault(item_id_file_path, []).append(item_id)

  current_item_ids_chunk = []
  item_ids_chunks = [current_item_ids_chunk]
  for item_ids_for_file_path in item_ids_by_path.values():
    if len(current_item_ids_chunk) + len(item_ids_for_file_path) > \
          args.item_bodies_chunk_size:
      current_item_ids_chunk = []
      item_ids_chunks.append(current_item_ids_chunk)
    current_item_ids_chunk.extend(item_ids_for_file_path)

  item_bodies_to_fetch = len(item_ids)
  fetched_item_bodies = [0]
  missing_item_bodies = set()
  def report_item_bodies_progress(requested_item_ids, found_item_ids):
    if found_item_ids is None:
      missing_item_bodies.update(requested_item_ids.difference(
          base.api.not_found_items_ids_to_ignore))
      return
    fetched_item_bodies[0] += len(found_item_ids)
    missing_item_bodies.update(
        set(requested_item_ids).difference(set(found_item_ids)).difference(
            base.api.not_found_items_ids_to_ignore))
    logging.info('  Fetched %s/%s item bodies (%s could not be loaded)',
        '{:,}'.format(fetched_item_bodies[0]),
        '{:,}'.format(item_bodies_to_fetch),
        '{:,}'.format(len(missing_item_bodies)))
  base.worker.do_work(
      lambda: FetchWriteItemBodiesWorker(api, items_directory),
      item_ids_chunks,
      args.parallelism,
      report_progress=report_item_bodies_progress)

  if missing_item_bodies:
    logging.warn('Item bodies could not be loaded for: %s',
        ', '.join([i.compact_form() for i in missing_item_bodies]))

  broadcast_stream_ids = [
      stream_id for stream_id in stream_ids
      if stream_id.startswith('user/') and
          stream_id.endswith('/state/com.google/broadcast')
  ]
  logging.info(
      'Fetching comments from %d shared item streams.',
      len(broadcast_stream_ids))
  encoded_sharers = api.fetch_encoded_sharers()
  remaining_broadcast_stream_ids = [len(broadcast_stream_ids)]
  def report_comments_progress(_, comments_by_item_id):
    if comments_by_item_id is None:
      return
    remaining_broadcast_stream_ids[0] -= 1
    comment_count = sum((len(c) for c in comments_by_item_id.values()), 0)
    logging.info('  Fetched %s comments, %s shared item streams left.',
        '{:,}'.format(comment_count),
        '{:,}'.format(remaining_broadcast_stream_ids[0]))
  all_comments = {}
  comments_for_broadcast_streams = base.worker.do_work(
      lambda: FetchCommentsWorker(
          api, encoded_sharers, args.comments_chunk_size),
      broadcast_stream_ids,
      args.parallelism,
      report_progress=report_comments_progress)
  total_comment_count = 0
  for comments_for_broadcast_stream in comments_for_broadcast_streams:
    if not comments_for_broadcast_stream:
      continue
    for item_id, comments in comments_for_broadcast_stream.iteritems():
      total_comment_count += len(comments)
      all_comments.setdefault(item_id, []).extend(comments)

  logging.info('Writing %s comments from %s items.',
      '{:,}'.format(total_comment_count),
      '{:,}'.format(len(all_comments)))
  for item_id, comments in all_comments.items():
    item_comments_file_path = os.path.join(base.paths.item_id_to_file_path(
        comments_directory, item_id), item_id.compact_form())
    base.paths.ensure_exists(os.path.dirname(item_comments_file_path))
    with open(item_comments_file_path, 'w') as item_comments_file:
      item_comments_file.write(json.dumps([c.to_json() for c in comments]))

  with open(os.path.join(output_directory, 'README'), 'w') as readme_file:
    readme_file.write('See https://github.com/mihaip/readerisdead/'
        'wiki/reader_archive-Format.\n')
Example 4
def main():
    base.log.init()
    base.atom.init()

    parser = argparse.ArgumentParser(
        description='Comprehensive archive of a Google Reader account')

    # Credentials
    parser.add_argument('--use_client_login',
                        action='store_true',
                        help='Instead of OAuth, use ClientLogin for '
                        'authentication. You will be prompted for a '
                        'username and password')
    parser.add_argument(
        '--oauth_refresh_token',
        default='',
        help='A previously obtained refresh token (used to bypass '
        'OAuth setup)')
    parser.add_argument('--account',
                        default='',
                        help='Google Account to save the archive for. Omit to '
                        'specify via standard input')
    parser.add_argument('--password',
                        default='',
                        help='Password for the account. Omit to specify via '
                        'standard input')

    # Output options
    parser.add_argument('--output_directory',
                        default='./',
                        help='Directory where to place archive data.')

    # Fetching options
    parser.add_argument(
        '--stream_items_chunk_size',
        type=int,
        default=10000,
        help='Number of item refs to request per stream items '
        'API call (higher is more efficient)')
    parser.add_argument(
        '--max_items_per_stream',
        type=int,
        default=0,
        help='If non-zero, will cap the number of items that are '
        'fetched per feed or tag')
    parser.add_argument(
        '--item_bodies_chunk_size',
        type=int,
        default=250,
        help='Number of item refs per request for fetching their '
        'bodies (higher is more efficient)')
    parser.add_argument(
        '--comments_chunk_size',
        type=int,
        default=250,
        help='Number of items per request for fetching comments '
        'on shared items (higher is more efficient)')
    parser.add_argument('--max_streams',
                        type=int,
                        default=0,
                        help='Maximum number of streams to archive (0 for no '
                        'limit, only meant to be used for development)')
    parser.add_argument('--parallelism',
                        type=int,
                        default=10,
                        help='Number of requests to make in parallel.')
    parser.add_argument('--http_retry_count',
                        type=int,
                        default=1,
                        help='Number of retries to make in the case of HTTP '
                        'request errors.')

    # Miscellaneous.
    parser.add_argument(
        '--additional_item_refs_file_path',
        default='',
        help='Path to JSON file listing additional tag item refs '
        'to fetch')

    args = parser.parse_args()

    output_directory = base.paths.normalize(args.output_directory)
    base.paths.ensure_exists(output_directory)

    def output_sub_directory(name):
        directory_path = os.path.join(output_directory, name)
        base.paths.ensure_exists(directory_path)
        return directory_path

    api_responses_directory = output_sub_directory('_raw_data')
    streams_directory = output_sub_directory('streams')
    data_directory = output_sub_directory('data')
    items_directory = output_sub_directory('items')
    comments_directory = output_sub_directory('comments')

    if args.use_client_login:
        authenticated_url_fetcher = base.url_fetcher.ClientLoginUrlFetcher(
            args.account, args.password)
    else:
        authenticated_url_fetcher = base.url_fetcher.OAuthUrlFetcher(
            args.oauth_refresh_token)
    api = base.api.Api(authenticated_url_fetcher=authenticated_url_fetcher,
                       http_retry_count=args.http_retry_count,
                       cache_directory=api_responses_directory)

    user_info = api.fetch_user_info()
    logging.info('Created API instance for %s (%s)', user_info.user_id,
                 user_info.email)

    logging.info('Saving preferences')
    _save_preferences(api, data_directory)

    logging.info('Gathering streams to fetch')
    stream_ids = _get_stream_ids(api, user_info.user_id, data_directory)
    if args.max_streams and len(stream_ids) > args.max_streams:
        stream_ids = stream_ids[:args.max_streams]
    logging.info('%d streams to fetch, gathering item refs:', len(stream_ids))

    item_ids, item_refs_total = _fetch_and_save_item_refs(
        stream_ids, api, args, streams_directory, user_info.user_id)
    logging.info('%s unique item refs (%s total), grouping by chunk.',
                 '{:,}'.format(len(item_ids)), '{:,}'.format(item_refs_total))

    logging.info('Grouped item refs, getting item bodies:')

    item_ids_chunks = _chunk_item_ids(item_ids, args.item_bodies_chunk_size)

    item_bodies_to_fetch = len(item_ids)
    fetched_item_bodies = [0]
    missing_item_bodies = set()

    def report_item_bodies_progress(requested_item_ids, found_item_ids):
        if found_item_ids is None:
            missing_item_bodies.update(
                set(requested_item_ids).difference(
                    base.api.not_found_items_ids_to_ignore))
            return
        fetched_item_bodies[0] += len(found_item_ids)
        missing_item_bodies.update(
            set(requested_item_ids).difference(set(found_item_ids)).difference(
                base.api.not_found_items_ids_to_ignore))
        logging.info('  Fetched %s/%s item bodies (%s could not be loaded)',
                     '{:,}'.format(fetched_item_bodies[0]),
                     '{:,}'.format(item_bodies_to_fetch),
                     '{:,}'.format(len(missing_item_bodies)))

    base.worker.do_work(
        lambda: FetchWriteItemBodiesWorker(api, items_directory),
        item_ids_chunks,
        args.parallelism,
        report_progress=report_item_bodies_progress)

    if missing_item_bodies:
        logging.warn(
            'Item bodies could not be loaded for: %s',
            ', '.join([i.compact_form() for i in missing_item_bodies]))

    broadcast_stream_ids = [
        stream_id for stream_id in stream_ids if stream_id.startswith('user/')
        and stream_id.endswith('/state/com.google/broadcast')
    ]
    logging.info('Fetching comments from %d shared item streams.',
                 len(broadcast_stream_ids))
    encoded_sharers = api.fetch_encoded_sharers()
    remaining_broadcast_stream_ids = [len(broadcast_stream_ids)]

    def report_comments_progress(_, comments_by_item_id):
        if comments_by_item_id is None:
            return
        remaining_broadcast_stream_ids[0] -= 1
        comment_count = sum((len(c) for c in comments_by_item_id.values()), 0)
        logging.info('  Fetched %s comments, %s shared item streams left.',
                     '{:,}'.format(comment_count),
                     '{:,}'.format(remaining_broadcast_stream_ids[0]))

    all_comments = {}
    comments_for_broadcast_streams = base.worker.do_work(
        lambda: FetchCommentsWorker(api, encoded_sharers, args.
                                    comments_chunk_size),
        broadcast_stream_ids,
        args.parallelism,
        report_progress=report_comments_progress)
    total_comment_count = 0
    for comments_for_broadcast_stream in comments_for_broadcast_streams:
        if not comments_for_broadcast_stream:
            continue
        for item_id, comments in comments_for_broadcast_stream.iteritems():
            total_comment_count += len(comments)
            all_comments.setdefault(item_id, []).extend(comments)

    logging.info('Writing %s comments from %s items.',
                 '{:,}'.format(total_comment_count),
                 '{:,}'.format(len(all_comments)))
    for item_id, comments in all_comments.items():
        item_comments_file_path = os.path.join(
            base.paths.item_id_to_file_path(comments_directory, item_id),
            item_id.compact_form())
        base.paths.ensure_exists(os.path.dirname(item_comments_file_path))
        with open(item_comments_file_path, 'w') as item_comments_file:
            item_comments_file.write(
                json.dumps([c.to_json() for c in comments]))

    with open(os.path.join(output_directory, 'README'), 'w') as readme_file:
        readme_file.write('See https://github.com/mihaip/readerisdead/'
                          'wiki/reader_archive-Format.\n')
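
Examples 2 and 4 also rely on _fetch_and_save_item_refs, which is likewise not shown here. Its behavior matches the inline item-ref fetching in Examples 1 and 3, so a reconstruction along the following lines should be close; the signature follows the call sites above, and the real helper may differ in details.

def _fetch_and_save_item_refs(stream_ids, api, args, streams_directory,
                              user_id):
    fetched_stream_ids = [0]

    def report_item_refs_progress(stream_id, item_refs):
        if item_refs is None:
            logging.error('  Could not load item refs from %s', stream_id)
            return
        fetched_stream_ids[0] += 1
        logging.info('  Loaded %s item refs from %s, %d streams left.',
                     '{:,}'.format(len(item_refs)), stream_id,
                     len(stream_ids) - fetched_stream_ids[0])

    item_refs_responses = base.worker.do_work(
        lambda: FetchItemRefsWorker(api, args.stream_items_chunk_size,
                                    args.max_items_per_stream),
        stream_ids,
        args.parallelism,
        report_progress=report_item_refs_progress)

    if args.additional_item_refs_file_path:
        _load_additional_item_refs(
            base.paths.normalize(args.additional_item_refs_file_path),
            stream_ids, item_refs_responses, user_id)

    item_ids = set()
    item_refs_total = 0
    for stream_id, item_refs in itertools.izip(stream_ids,
                                               item_refs_responses):
        if not item_refs:
            continue
        item_ids.update([item_ref.item_id for item_ref in item_refs])
        item_refs_total += len(item_refs)

        # Record explore-stream item IDs so that later "not found" responses
        # for them are not reported as missing bodies.
        if stream_id == base.api.EXPLORE_STREAM_ID:
            base.api.not_found_items_ids_to_ignore.update(
                [i.item_id for i in item_refs])

        # Persist the per-stream item refs as one JSON file per stream.
        stream = base.api.Stream(stream_id=stream_id, item_refs=item_refs)
        stream_file_name = (base.paths.stream_id_to_file_name(stream_id) +
                            '.json')
        stream_file_path = os.path.join(streams_directory, stream_file_name)
        with open(stream_file_path, 'w') as stream_file:
            stream_file.write(json.dumps(stream.to_json()))

    return list(item_ids), item_refs_total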