Example #1
0
def test_warc_enabled():
    """Check that a request made with a WARC writer attached is recorded.

    After one ``get_json`` call, the writer's stream is expected to hold a
    'response' record followed by a 'request' record, both targeting the
    group's API URL.
    """
    # The responses mocking framework is deliberately not used here because it
    # conflicts with the warc captures. A real request goes out to Yahoo, so
    # this test might fail.
    target_uri = 'https://groups.yahoo.com/api/v1/groups/test/'

    api = YahooGroupsAPI('test')
    warc_buffer = BufferWARCWriter(gzip=False)
    api.set_warc_writer(warc_buffer)
    api.get_json('HackGroupInfo')

    # Collect (target URI, record type) pairs in the order they were written.
    captured = []
    for rec in ArchiveIterator(warc_buffer.get_stream()):
        captured.append((rec.rec_headers['WARC-Target-URI'], rec.rec_type))

    wanted = [(target_uri, 'response'), (target_uri, 'request')]
    assert wanted == captured
Example #2
0
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error(
                    'WARC output requires the warcio package to be installed.')
                exit(1)
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            warcmeta = warc_writer.create_warcinfo_record(
                fhwarc.name, WARC_META_PARAMS)
            warc_writer.write_record(warcmeta)
            yga.set_warc_writer(warc_writer)

        if args.email:
            with Mkchdir('email'):
                archive_email(yga,
                              message_subset=args.ids,
                              start=args.start,
                              stop=args.stop)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
            with Mkchdir('photos'):
                archive_photos(yga)
        if args.database:
            with Mkchdir('databases'):
Example #3
0
def main():
    """Parse the command line and archive the requested parts of a group.

    Steps, in order:

    1. Configure the root logger: either ``coloredlogs`` (with ``--colour``)
       or a plain stdout handler, at a level chosen by ``--verbose`` /
       ``--quiet``.
    2. Build the cookie jar, optional ``User-Agent`` header and the
       ``YahooGroupsAPI`` client.
    3. If no content flag was given, enable every content flag except
       ``email`` (see the comment at that step).
    4. Inside the group directory, attach an ``archive.log`` file handler,
       optionally attach a WARC writer appending to ``data.warc.gz``, and run
       each selected ``archive_*`` step in its own sub-directory.

    The WARC file, when opened, is closed even if an archive step raises.

    Raises:
        SystemExit: if ``--colour`` is given without ``coloredlogs``
            installed, or ``--warc`` is given without ``warcio`` installed.
    """
    args = parse_arguments()

    # Setup logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # NOTE(review): '%f' is not a time.strftime directive; presumably
    # CustomFormatter handles it itself -- confirm against its definition.
    log_format = {
        'fmt': '%(asctime)s %(levelname)s %(name)s %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S.%f %Z'
    }
    log_formatter = CustomFormatter(**log_format)

    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.ERROR
    else:
        log_level = logging.INFO

    if args.colour:
        try:
            import coloredlogs
        except ImportError:
            sys.exit(
                "Error: Coloured logging output requires the 'coloredlogs' package to be installed."
            )
        coloredlogs.install(level=log_level, **log_format)
    else:
        log_stdout_handler = logging.StreamHandler(sys.stdout)
        log_stdout_handler.setLevel(log_level)
        log_stdout_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_stdout_handler)

    cookie_jar = init_cookie_jar(args.cookie_file, args.cookie_t,
                                 args.cookie_y, args.cookie_e)

    headers = {}
    if args.user_agent:
        headers['User-Agent'] = args.user_agent

    yga = YahooGroupsAPI(args.group, cookie_jar, headers, min_delay=args.delay)

    # Default to all unique content. This includes topics and raw email,
    # but not the full email download since that would duplicate html emails we get through topics.
    if not (args.email or args.files or args.photos or args.database
            or args.links or args.calendar or args.about or args.polls
            or args.attachments or args.members or args.topics or args.raw):
        args.files = args.photos = args.database = args.links = args.calendar = args.about = \
            args.polls = args.attachments = args.members = args.topics = args.raw = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log', 'a', 'utf-8')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        # Stays None unless --warc opened the capture file; the finally
        # clause below uses this to decide whether there is anything to close.
        fhwarc = None
        try:
            if args.warc:
                # Only the import is guarded, so an unrelated ImportError
                # raised while writing records is not misreported as a
                # missing package.
                try:
                    from warcio import WARCWriter
                except ImportError:
                    logging.error(
                        'WARC output requires the warcio package to be installed.')
                    # sys.exit rather than the site-provided exit() builtin,
                    # which is not guaranteed to exist (e.g. under python -S).
                    sys.exit(1)
                fhwarc = open('data.warc.gz', 'ab')
                warc_writer = WARCWriter(fhwarc)
                warcmeta = warc_writer.create_warcinfo_record(
                    fhwarc.name, WARC_META_PARAMS)
                warc_writer.write_record(warcmeta)
                yga.set_warc_writer(warc_writer)

            if args.overwrite:
                hacky_vars['file'] = True

            if args.email:
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  noAttachments=args.noattachments)
            if args.files:
                with Mkchdir('files'):
                    archive_files(yga)
            if args.photos:
                with Mkchdir('photos'):
                    archive_photos(yga)
            if args.topics:
                with Mkchdir('topics'):
                    archive_topics(yga, noAttachments=args.noattachments)
            if args.raw:
                # Shares the 'email' directory with the full email download,
                # but is invoked with skipHTML=True.
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  skipHTML=True)
            if args.database:
                with Mkchdir('databases'):
                    archive_db(yga)
            if args.links:
                with Mkchdir('links'):
                    archive_links(yga)
            if args.about:
                with Mkchdir('about'):
                    archive_about(yga)
            if args.polls:
                with Mkchdir('polls'):
                    archive_polls(yga)
            if args.attachments:
                with Mkchdir('attachments'):
                    archive_attachments(yga)
            if args.members:
                with Mkchdir('members'):
                    archive_members(yga)
            if args.calendar:
                with Mkchdir('calendar'):
                    archive_calendar(yga)
        finally:
            # Close the WARC file on failure as well as success, so that an
            # exception in any archive step does not leak the open handle.
            if fhwarc is not None:
                fhwarc.close()