def test_warc_enabled():
    """Verify that an attached WARC writer records the request/response pair.

    A ``BufferWARCWriter`` is attached to a ``YahooGroupsAPI`` client, one
    JSON call is issued, and the records read back from the buffer are
    compared against the expected (target URI, record type) pairs.
    """
    # Note that this does not use the responses mocking framework, as it
    # conflicts with the warc captures.
    # This makes a real request to Yahoo, so might fail.
    target_url = 'https://groups.yahoo.com/api/v1/groups/test/'

    api = YahooGroupsAPI('test')
    warc_buffer = BufferWARCWriter(gzip=False)
    api.set_warc_writer(warc_buffer)
    api.get_json('HackGroupInfo')

    expected = [(target_url, 'response'), (target_url, 'request')]

    # Collect (target URI, record type) for every record in the buffer,
    # in the order the writer emitted them.
    actual = []
    for record in ArchiveIterator(warc_buffer.get_stream()):
        target_uri = record.rec_headers['WARC-Target-URI']
        actual.append((target_uri, record.rec_type))

    assert expected == actual
log_file_handler.setFormatter(log_formatter) root_logger.addHandler(log_file_handler) if args.warc: try: from warcio import WARCWriter except ImportError: logging.error( 'WARC output requires the warcio package to be installed.') exit(1) fhwarc = open('data.warc.gz', 'ab') warc_writer = WARCWriter(fhwarc) warcmeta = warc_writer.create_warcinfo_record( fhwarc.name, WARC_META_PARAMS) warc_writer.write_record(warcmeta) yga.set_warc_writer(warc_writer) if args.email: with Mkchdir('email'): archive_email(yga, message_subset=args.ids, start=args.start, stop=args.stop) if args.files: with Mkchdir('files'): archive_files(yga) if args.photos: with Mkchdir('photos'): archive_photos(yga) if args.database: with Mkchdir('databases'):
def main():
    """Command-line entry point: configure logging and archive the selected content.

    Steps, in order:

    1. Parse the command line with ``parse_arguments()``.
    2. Configure the root logger (stdout handler, or ``coloredlogs`` when
       ``args.colour`` is set).
    3. Build the ``YahooGroupsAPI`` client from the cookie and header options.
    4. If no content type was selected, select every type except the full
       email download.
    5. Inside ``Mkchdir(args.group, sanitize=False)``, add a file log handler
       (``archive.log``), optionally attach a WARC writer
       (``data.warc.gz``), and run each selected archiver inside its own
       ``Mkchdir`` sub-directory.

    Raises:
        SystemExit: if coloured logging is requested but ``coloredlogs`` is
            not importable, or if WARC output is requested but ``warcio`` is
            not importable (exit status 1).
    """
    args = parse_arguments()

    # Setup logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    log_format = {
        'fmt': '%(asctime)s %(levelname)s %(name)s %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S.%f %Z'
    }
    log_formatter = CustomFormatter(**log_format)

    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.ERROR
    else:
        log_level = logging.INFO

    if args.colour:
        try:
            import coloredlogs
        except ImportError:
            sys.exit(
                "Error: Coloured logging output requires the 'coloredlogs' package to be installed."
            )
        coloredlogs.install(level=log_level, **log_format)
    else:
        log_stdout_handler = logging.StreamHandler(sys.stdout)
        log_stdout_handler.setLevel(log_level)
        log_stdout_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_stdout_handler)

    cookie_jar = init_cookie_jar(args.cookie_file, args.cookie_t, args.cookie_y, args.cookie_e)

    headers = {}
    if args.user_agent:
        headers['User-Agent'] = args.user_agent

    yga = YahooGroupsAPI(args.group, cookie_jar, headers, min_delay=args.delay)

    # Default to all unique content. This includes topics and raw email,
    # but not the full email download since that would duplicate html emails we get through topics.
    if not (args.email or args.files or args.photos or args.database or args.links
            or args.calendar or args.about or args.polls or args.attachments
            or args.members or args.topics or args.raw):
        args.files = args.photos = args.database = args.links = args.calendar = args.about = \
            args.polls = args.attachments = args.members = args.topics = args.raw = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log', 'a', 'utf-8')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        # Stays None unless WARC output is enabled; the finally clause below
        # uses it to decide whether there is a file to close.
        fhwarc = None
        try:
            if args.warc:
                # Only the import is guarded, so an ImportError raised while
                # writing records is not misreported as a missing package.
                try:
                    from warcio import WARCWriter
                except ImportError:
                    logging.error(
                        'WARC output requires the warcio package to be installed.')
                    sys.exit(1)
                fhwarc = open('data.warc.gz', 'ab')
                warc_writer = WARCWriter(fhwarc)
                warcmeta = warc_writer.create_warcinfo_record(
                    fhwarc.name, WARC_META_PARAMS)
                warc_writer.write_record(warcmeta)
                yga.set_warc_writer(warc_writer)

            if args.overwrite:
                hacky_vars['file'] = True

            if args.email:
                with Mkchdir('email'):
                    archive_email(yga, message_subset=args.ids, start=args.start,
                                  stop=args.stop, noAttachments=args.noattachments)
            if args.files:
                with Mkchdir('files'):
                    archive_files(yga)
            if args.photos:
                with Mkchdir('photos'):
                    archive_photos(yga)
            if args.topics:
                with Mkchdir('topics'):
                    archive_topics(yga, noAttachments=args.noattachments)
            if args.raw:
                # Shares the 'email' directory with the full email download.
                with Mkchdir('email'):
                    archive_email(yga, message_subset=args.ids, start=args.start,
                                  stop=args.stop, skipHTML=True)
            if args.database:
                with Mkchdir('databases'):
                    archive_db(yga)
            if args.links:
                with Mkchdir('links'):
                    archive_links(yga)
            if args.about:
                with Mkchdir('about'):
                    archive_about(yga)
            if args.polls:
                with Mkchdir('polls'):
                    archive_polls(yga)
            if args.attachments:
                with Mkchdir('attachments'):
                    archive_attachments(yga)
            if args.members:
                with Mkchdir('members'):
                    archive_members(yga)
            if args.calendar:
                with Mkchdir('calendar'):
                    archive_calendar(yga)
        finally:
            # Close the WARC file even when an archiver raises, so the
            # handle is not leaked and buffered data is flushed to disk.
            if fhwarc is not None:
                fhwarc.close()