Exemple #1
0
class CDXToolkitWARCWriter:
    """Write WARC records into a series of size-limited, uniquely named files.

    Files are named ``<prefix>-[<subprefix>-]<NNNNNN>.extracted.warc[.gz]``
    where ``NNNNNN`` is the zero-padded segment number. A file is opened
    lazily by the first ``write_record`` call and begins with a warcinfo
    record built from ``info``. After a write, if the file is larger than
    ``size`` bytes it is closed and the next record starts a new segment.

    Call ``close()`` when finished, or use the instance as a context manager,
    so that the last segment's file handle is released deterministically.
    """

    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True, warc_version=None):
        """Store the configuration; no file is opened until the first write.

        Args:
            prefix: Leading part of every output filename (may include a path).
            subprefix: Optional second filename component; skipped when None.
            info: Passed to ``create_warcinfo_record`` for each new file.
            size: Size threshold in bytes that triggers a new segment.
            gzip: Passed to ``WARCWriter``; also appends ``.gz`` to filenames.
            warc_version: WARC version passed to ``WARCWriter``; None is
                replaced by ``'1.0'`` on the first write.
        """
        self.prefix = prefix
        self.subprefix = subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.warc_version = warc_version
        self.segment = 0
        self.writer = None
        # Defined up front so close() is safe before anything was written.
        self.fd = None
        self.filename = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the current output file, if any. Safe to call repeatedly.

        A later ``write_record`` call starts a new file; the existing one is
        skipped by ``_unique_warc_filename`` because it already exists.
        """
        if self.fd is not None:
            # Closing an already-closed file object is a no-op.
            self.fd.close()
        self.writer = None

    def write_record(self, *args, **kwargs):
        """Write one record, opening a new WARC file first if none is open.

        All arguments are forwarded unchanged to the underlying writer's
        ``write_record``.
        """
        if self.writer is None:
            if self.warc_version is None:
                # opportunity to intuit warc version here
                self.warc_version = '1.0'
            if self.warc_version != '1.0':
                LOGGER.error('WARC versions other than 1.0 are not correctly supported yet')
                # ...because fake_wb_warc always generates 1.0
            # should we also check the warcinfo record to make sure it's got a matching warc_version inside?
            self._start_new_warc()

        self.writer.write_record(*args, **kwargs)

        # Roll over to the next segment once the file on disk exceeds the limit.
        fsize = os.fstat(self.fd.fileno()).st_size
        if fsize > self.size:
            self.close()
            self.segment += 1

    def _unique_warc_filename(self):
        """Return the first filename, from the current segment on, not on disk.

        ``self.segment`` is advanced past every segment number whose file
        already exists.
        """
        while True:
            name = self.prefix + '-'
            if self.subprefix is not None:
                name += self.subprefix + '-'
            name += '{:06d}'.format(self.segment) + '.extracted.warc'
            if self.gzip:
                name += '.gz'
            if os.path.exists(name):
                self.segment += 1
            else:
                break
        return name

    def _start_new_warc(self):
        """Open the next output file and write its leading warcinfo record."""
        self.filename = self._unique_warc_filename()
        self.fd = open(self.filename, 'wb')
        LOGGER.info('opening new warc file %s', self.filename)
        self.writer = WARCWriter(self.fd, gzip=self.gzip, warc_version=self.warc_version)
        warcinfo = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(warcinfo)
Exemple #2
0
class CDXToolkitWARCWriter:
    """Write WARC records into numbered output files, rolling over by size.

    Output files are named
    ``<prefix>-[<subprefix>-]<NNNNNN>.extracted.warc[.gz]``. The first
    ``write_record`` call opens a file and writes a warcinfo record built
    from ``info``. When a file is larger than ``size`` bytes after a write,
    it is closed and the following record goes into a new file.

    Use ``close()`` or a ``with`` block to release the last file handle.
    """

    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True):
        """Remember the settings; files are opened lazily on first write.

        Args:
            prefix: Leading part of every output filename (may include a path).
            subprefix: Optional second filename component; skipped when None.
            info: Passed to ``create_warcinfo_record`` for each new file.
            size: Size threshold in bytes that triggers a new segment.
            gzip: Passed to ``WARCWriter``; also appends ``.gz`` to filenames.
        """
        self.prefix = prefix
        self.subprefix = subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.segment = 0
        self.writer = None
        # Present from construction so close() works before any write.
        self.fd = None
        self.filename = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the open output file, if there is one. Idempotent."""
        if self.fd is not None:
            # close() on an already-closed file object does nothing.
            self.fd.close()
        self.writer = None

    def write_record(self, *args, **kwargs):
        """Forward a record to the current WARC file, opening one if needed."""
        if self.writer is None:
            self._start_new_warc()

        self.writer.write_record(*args, **kwargs)

        # Start a fresh segment on the next write once this file is too big.
        fsize = os.fstat(self.fd.fileno()).st_size
        if fsize > self.size:
            self.close()
            self.segment += 1

    def _unique_warc_filename(self):
        """Return a filename that does not exist yet, bumping ``self.segment``
        past every segment number that is already taken on disk."""
        while True:
            name = self.prefix + '-'
            if self.subprefix is not None:
                name += self.subprefix + '-'
            name += '{:06d}'.format(self.segment) + '.extracted.warc'
            if self.gzip:
                name += '.gz'
            if os.path.exists(name):
                self.segment += 1
            else:
                break
        return name

    def _start_new_warc(self):
        """Open the next output file and write the warcinfo record into it."""
        self.filename = self._unique_warc_filename()
        self.fd = open(self.filename, 'wb')
        LOGGER.info('opening new warc file %s', self.filename)
        self.writer = WARCWriter(self.fd, gzip=self.gzip)
        warcinfo = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(warcinfo)
Exemple #3
0
    # Everything below runs inside the Mkchdir(args.group) context; Mkchdir is
    # defined elsewhere (presumably it creates and enters the directory, confirm).
    with Mkchdir(args.group, sanitize=False):
        # Also send log output to 'archive.log', using the shared formatter.
        log_file_handler = logging.FileHandler('archive.log')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            # warcio is imported here so it is only required when WARC output
            # was requested.
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error(
                    'WARC output requires the warcio package to be installed.')
                exit(1)
            # Opened in append-binary mode; a warcinfo record is written on
            # every run before the writer is handed to the API object.
            # NOTE(review): fhwarc is not closed within this excerpt; confirm
            # it is closed later in the function.
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            warcmeta = warc_writer.create_warcinfo_record(
                fhwarc.name, WARC_META_PARAMS)
            warc_writer.write_record(warcmeta)
            yga.set_warc_writer(warc_writer)

        # Each selected content type is archived inside its own Mkchdir context.
        if args.email:
            with Mkchdir('email'):
                archive_email(yga,
                              message_subset=args.ids,
                              start=args.start,
                              stop=args.stop)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
            with Mkchdir('photos'):
                archive_photos(yga)
Exemple #4
0
def main():
    """Parse the command line, configure logging and archive the group.

    Log records go to stdout (through ``coloredlogs`` when ``args.colour`` is
    set) at a level chosen from ``args.verbose`` / ``args.quiet``, and to
    ``archive.log`` inside the group directory. Each content type selected in
    ``args`` is then archived inside its own ``Mkchdir`` context; when none is
    selected, a default set is enabled. With ``args.warc`` set, a warcio
    writer appending to ``data.warc.gz`` is attached to the API object and
    the file is closed again even if an archive step raises.

    Raises:
        SystemExit: if coloured output or WARC output is requested but the
            required package (``coloredlogs`` / ``warcio``) is not installed.
    """
    args = parse_arguments()

    # Setup logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # NOTE(review): '%f' is not a time.strftime directive; presumably
    # CustomFormatter handles it itself (confirm).
    log_format = {
        'fmt': '%(asctime)s %(levelname)s %(name)s %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S.%f %Z'
    }
    log_formatter = CustomFormatter(**log_format)

    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.ERROR
    else:
        log_level = logging.INFO
    if args.colour:
        try:
            import coloredlogs
        except ImportError:
            sys.exit(
                "Error: Coloured logging output requires the 'coloredlogs' package to be installed."
            )
        coloredlogs.install(level=log_level, **log_format)
    else:
        log_stdout_handler = logging.StreamHandler(sys.stdout)
        log_stdout_handler.setLevel(log_level)
        log_stdout_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_stdout_handler)

    cookie_jar = init_cookie_jar(args.cookie_file, args.cookie_t,
                                 args.cookie_y, args.cookie_e)

    headers = {}
    if args.user_agent:
        headers['User-Agent'] = args.user_agent

    yga = YahooGroupsAPI(args.group, cookie_jar, headers, min_delay=args.delay)

    # Default to all unique content. This includes topics and raw email,
    # but not the full email download since that would duplicate html emails we get through topics.
    if not (args.email or args.files or args.photos or args.database
            or args.links or args.calendar or args.about or args.polls
            or args.attachments or args.members or args.topics or args.raw):
        args.files = args.photos = args.database = args.links = args.calendar = args.about = \
            args.polls = args.attachments = args.members = args.topics = args.raw = True

    with Mkchdir(args.group, sanitize=False):
        # No level is set on the file handler, so only the root logger's
        # DEBUG level limits what reaches it.
        log_file_handler = logging.FileHandler('archive.log', 'a', 'utf-8')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        fhwarc = None
        if args.warc:
            # Only the import sits in the try body, so an ImportError raised
            # anywhere else is not misreported as "warcio missing".
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error(
                    'WARC output requires the warcio package to be installed.')
                # sys.exit rather than the site-provided exit() builtin,
                # which is not guaranteed to exist (e.g. under python -S).
                sys.exit(1)
            fhwarc = open('data.warc.gz', 'ab')

        # try/finally guarantees the WARC file is closed when a step fails.
        try:
            if fhwarc is not None:
                warc_writer = WARCWriter(fhwarc)
                warcmeta = warc_writer.create_warcinfo_record(
                    fhwarc.name, WARC_META_PARAMS)
                warc_writer.write_record(warcmeta)
                yga.set_warc_writer(warc_writer)

            if args.overwrite:
                hacky_vars['file'] = True
            if args.email:
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  noAttachments=args.noattachments)
            if args.files:
                with Mkchdir('files'):
                    archive_files(yga)
            if args.photos:
                with Mkchdir('photos'):
                    archive_photos(yga)
            if args.topics:
                with Mkchdir('topics'):
                    archive_topics(yga, noAttachments=args.noattachments)
            if args.raw:
                # Raw mode reuses the 'email' directory but passes skipHTML=True.
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  skipHTML=True)
            if args.database:
                with Mkchdir('databases'):
                    archive_db(yga)
            if args.links:
                with Mkchdir('links'):
                    archive_links(yga)
            if args.about:
                with Mkchdir('about'):
                    archive_about(yga)
            if args.polls:
                with Mkchdir('polls'):
                    archive_polls(yga)
            if args.attachments:
                with Mkchdir('attachments'):
                    archive_attachments(yga)
            if args.members:
                with Mkchdir('members'):
                    archive_members(yga)
            if args.calendar:
                with Mkchdir('calendar'):
                    archive_calendar(yga)
        finally:
            if fhwarc is not None:
                fhwarc.close()