Exemple #1
0
 def _start_new_warc(self):
     """Open the next unused WARC file and start it with a warcinfo record.

     Sets ``self.filename``, ``self.fd`` and ``self.writer`` as a side effect.
     """
     path = self._unique_warc_filename()
     self.filename = path
     self.fd = open(path, 'wb')
     LOGGER.info('opening new warc file %s', path)
     writer = WARCWriter(self.fd, gzip=self.gzip, warc_version=self.warc_version)
     self.writer = writer
     # The warcinfo record is written before any caller-supplied record.
     writer.write_record(writer.create_warcinfo_record(path, self.info))
Exemple #2
0
 def __init__(self, name, default_delay=3600, n_thread=None):
     """Create the WARC output, lock, scheduler and worker pool.

     ``name`` is the output stem: records go to ``<name>.warc.gz``.
     ``default_delay`` is stored unchanged and ``n_thread`` is handed to
     ``ThreadPoolExecutor``.
     """
     warc_file = open(f"{name}.warc.gz", "wb")
     self.writer = WARCWriter(warc_file)
     self.lock = RLock()
     self.scheduler = scheduler(time.time)
     self.executor = ThreadPoolExecutor(n_thread)
     self.default_delay = default_delay
     # Per-key bookkeeping, both filled in later by other methods.
     self.delay_adjusters = {}  # Constant delay
     self.last = {}
Exemple #3
0
class CDXToolkitWARCWriter:
    """Write WARC records into size-limited, sequentially numbered files.

    Files are named ``<prefix>-[<subprefix>-]<segment:06d>.extracted.warc``
    with ``.gz`` appended when ``gzip`` is truthy. A file is opened lazily on
    the first write and closed once its size exceeds ``size`` bytes; the next
    write then opens a new segment.
    """

    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True, warc_version=None):
        self.prefix, self.subprefix = prefix, subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.warc_version = warc_version
        self.segment = 0
        self.writer = None  # created on demand by write_record()

    def write_record(self, *args, **kwargs):
        """Forward to ``WARCWriter.write_record``, rotating files by size."""
        if self.writer is None:
            if self.warc_version is None:
                # opportunity to intuit warc version here
                self.warc_version = '1.0'
            if self.warc_version != '1.0':
                # ...because fake_wb_warc always generates 1.0
                LOGGER.error('WARC versions other than 1.0 are not correctly supported yet')
            # should we also check the warcinfo record to make sure it's got a matching warc_version inside?
            self._start_new_warc()

        self.writer.write_record(*args, **kwargs)

        # Rotate after the write, so a file may exceed the limit by one record.
        if os.fstat(self.fd.fileno()).st_size > self.size:
            self.fd.close()
            self.writer = None
            self.segment += 1

    def _unique_warc_filename(self):
        """Return the first segment filename that does not exist yet.

        ``self.segment`` is advanced past every name already on disk.
        """
        suffix = '.extracted.warc.gz' if self.gzip else '.extracted.warc'
        stem = self.prefix + '-'
        if self.subprefix is not None:
            stem += self.subprefix + '-'

        candidate = stem + '{:06d}'.format(self.segment) + suffix
        while os.path.exists(candidate):
            self.segment += 1
            candidate = stem + '{:06d}'.format(self.segment) + suffix
        return candidate

    def _start_new_warc(self):
        """Open the next segment file and write its warcinfo record."""
        path = self._unique_warc_filename()
        self.filename = path
        self.fd = open(path, 'wb')
        LOGGER.info('opening new warc file %s', path)
        writer = WARCWriter(self.fd, gzip=self.gzip, warc_version=self.warc_version)
        self.writer = writer
        writer.write_record(writer.create_warcinfo_record(path, self.info))
Exemple #4
0
    def process_item(self, item, spider):
        """Archive one scraped item as a gzip-compressed WARC ``response`` record.

        Reads ``item['headers']``, ``item['url']`` and ``item['content']`` and
        writes the resulting record to ``self.output``. ``spider`` is unused.

        Returns:
            The item, unchanged. The ``(item, spider)`` signature is that of a
            Scrapy item pipeline, where the return value is what later
            pipeline stages receive; returning ``None`` would drop the item
            for them.
        """
        writer = WARCWriter(self.output, gzip=True)

        # NOTE(review): the status line is always recorded as 200 OK; the item
        # carries no status field here, so the real status cannot be stored.
        http_headers = StatusAndHeaders('200 OK',
                                        item['headers'],
                                        protocol='HTTP/1.0')

        record = writer.create_warc_record(item['url'],
                                           'response',
                                           payload=BytesIO(item['content']),
                                           http_headers=http_headers)

        writer.write_record(record)
        return item
Exemple #5
0
def construct_warcio_record(url, warcheader, httpheader, content_bytes):
    """Build a WARC ``response`` record from raw header and body bytes.

    Args:
        url: URI passed to ``create_warc_record``.
        warcheader: raw WARC header block as CRLF-separated bytes, or a falsy
            value for none. Its first line (the ``WARC/1`` line) is skipped.
        httpheader: raw HTTP header block as bytes.
        content_bytes: HTTP body as bytes.

    Returns:
        Whatever ``WARCWriter.create_warc_record`` returns.

    Raises:
        ValueError: if a non-blank WARC header line contains no ``:``.
    """
    # payload will be parsed for http headers
    payload = httpheader.rstrip(b'\r\n') + b'\r\n\r\n' + content_bytes

    warc_headers_dict = {}
    if warcheader:
        for header in warcheader.split(b'\r\n')[1:]:  # skip the initial WARC/1 line
            if not header.strip():
                # A header block that ends in CRLF splits into empty trailing
                # entries; they carry no header and must not be unpacked.
                continue
            k, v = header.split(b':', 1)
            warc_headers_dict[k] = v.strip()

    writer = WARCWriter(None)
    return writer.create_warc_record(url, 'response',
                                     payload=BytesIO(payload),
                                     warc_headers_dict=warc_headers_dict)
Exemple #6
0
def filter_warc_stream(ids, warc_in, warc_out):
    """Copy the records whose ID is in *ids* from *warc_in* to *warc_out*.

    Output is gzip-compressed. Progress is logged every 10000 records and a
    summary is printed at the end. A failed write is logged and counted
    instead of being raised.
    """
    writer = WARCWriter(warc_out, gzip=True)

    total = output = errors = 0
    for total, record in enumerate(ArchiveIterator(warc_in), start=1):
        if get_record_id(record) in ids:
            # Counted before the write, so failed writes are included.
            output += 1
            try:
                writer.write_record(record)
            except Exception as e:
                logging.error(f'failed to write record: {e}')
                errors += 1
        if not total % 10000:
            logging.info(f'processed {total} records, output {output}, '
                         f'{errors} errors')
    print(f'Done, processed {total} records, output {output}, '
          f'{errors} errors')
Exemple #7
0
class CDXToolkitWARCWriter:
    """Size-rotating WARC writer.

    Records go to ``<prefix>-[<subprefix>-]<segment:06d>.extracted.warc``
    (``.gz`` appended when ``gzip`` is truthy). The first write opens a file;
    once a file is larger than ``size`` bytes it is closed and the following
    write starts the next segment.
    """

    def __init__(self, prefix, subprefix, info, size=1000000000, gzip=True):
        self.prefix, self.subprefix = prefix, subprefix
        self.info = info
        self.size = size
        self.gzip = gzip
        self.segment = 0
        self.writer = None  # no file is open until the first write

    def write_record(self, *args, **kwargs):
        """Forward to ``WARCWriter.write_record``, rotating files by size."""
        if self.writer is None:
            self._start_new_warc()
        self.writer.write_record(*args, **kwargs)

        # The size check happens after the write, so the limit is a soft one.
        if os.fstat(self.fd.fileno()).st_size > self.size:
            self.fd.close()
            self.writer = None
            self.segment += 1

    def _unique_warc_filename(self):
        """Return the first segment filename not yet present on disk.

        ``self.segment`` is bumped once for every name that already exists.
        """
        def current_name():
            stem = self.prefix + '-'
            if self.subprefix is not None:
                stem += self.subprefix + '-'
            stem += '{:06d}'.format(self.segment) + '.extracted.warc'
            return stem + '.gz' if self.gzip else stem

        name = current_name()
        while os.path.exists(name):
            self.segment += 1
            name = current_name()
        return name

    def _start_new_warc(self):
        """Open the next segment file and write its warcinfo record."""
        path = self._unique_warc_filename()
        self.filename = path
        self.fd = open(path, 'wb')
        LOGGER.info('opening new warc file %s', path)
        writer = WARCWriter(self.fd, gzip=self.gzip)
        self.writer = writer
        writer.write_record(writer.create_warcinfo_record(path, self.info))
Exemple #8
0
def fake_wb_warc(url, wb_url, resp, capture):
    '''
    Given a playback from a wayback, fake up a warc response record

    Args:
        url: URL for the record. It is replaced by ``capture['url']`` when
            the playback status differs from ``capture['status']``.
        wb_url: playback URL, stored as ``WARC-Source-URI``.
        resp: playback response; ``status_code``, ``reason``, ``headers`` and
            ``content`` are read from it.
        capture: mapping with ``status``, ``url`` and ``timestamp`` entries.

    Returns:
        The record built by ``WARCWriter.create_warc_record``.
    '''
    status_code = resp.status_code
    status_reason = resp.reason

    if str(status_code) != capture['status']:
        url = capture['url']
        timestamp = capture['timestamp']
        if status_code == 200 and capture['status'] == '-':
            LOGGER.warning('revisit record vivified by wayback for %s %s',
                           url, timestamp)
        elif status_code == 200 and capture['status'].startswith('3'):
            LOGGER.warning('redirect capture came back 200, same-surt same-timestamp capture? %s %s',
                           url, timestamp)
        elif status_code == 302 and capture['status'].startswith('3'):
            # this is OK, wayback always sends a temporary redir
            status_code = int(capture['status'])
            if status_code != resp.status_code and status_code in http_status_text:
                status_reason = http_status_text[status_code]
        else:  # pragma: no cover
            LOGGER.warning('surprised that status code is now=%d orig=%s %s %s',
                           status_code, capture['status'], url, timestamp)

    # Rebuild the original HTTP headers from the playback response:
    # x-archive-orig-* headers lose the prefix, Content-Type is kept,
    # Location is mapped back to the original URL, and everything else is
    # marked with an X-Archive- prefix.
    http_headers = []
    http_date = None
    for k, v in resp.headers.items():
        kl = k.lower()
        if kl.startswith('x-archive-orig-date'):
            http_date = v

        if kl.startswith('x-archive-orig-'):
            k = k[len('x-archive-orig-'):]
            http_headers.append((k, v))
        elif kl == 'content-type':
            http_headers.append(('Content-Type', v))
        elif kl == 'location':
            v = wb_redir_to_original(v)
            http_headers.append((k, v))
        else:
            if not kl.startswith('x-archive-'):
                k = 'X-Archive-' + k
            http_headers.append((k, v))

    statusline = '{} {}'.format(status_code, status_reason)
    http_headers = StatusAndHeaders(statusline, headers=http_headers, protocol='HTTP/1.1')

    warc_headers_dict = {
        'WARC-Source-URI': wb_url,
        # Use an explicit UTC time: a naive local now() would be written out
        # as if it were UTC (assuming datetime_to_iso_date appends a literal
        # 'Z', as warcio's does).
        'WARC-Creation-Date': datetime_to_iso_date(
            datetime.datetime.now(datetime.timezone.utc)),
    }
    if http_date:
        warc_headers_dict['WARC-Date'] = datetime_to_iso_date(http_date_to_datetime(http_date))

    content_bytes = resp.content

    writer = WARCWriter(None)  # needs warc_version here?
    return writer.create_warc_record(url, 'response',
                                     payload=BytesIO(content_bytes),
                                     http_headers=http_headers,
                                     warc_headers_dict=warc_headers_dict)
Exemple #9
0
            args.polls = args.attachments = args.members = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error(
                    'WARC output requires the warcio package to be installed.')
                exit(1)
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            warcmeta = warc_writer.create_warcinfo_record(
                fhwarc.name, WARC_META_PARAMS)
            warc_writer.write_record(warcmeta)
            yga.set_warc_writer(warc_writer)

        if args.email:
            with Mkchdir('email'):
                archive_email(yga,
                              message_subset=args.ids,
                              start=args.start,
                              stop=args.stop)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
Exemple #10
0
class Harvester:
    """Repeatedly fetch websites, archive each response, and adapt the delay.

    Every response is written to ``<name>.warc.gz``. Visits are queued on a
    ``sched`` scheduler and executed on a thread pool; after each visit the
    site's delay adjuster decides when the next visit happens.
    """

    def __init__(self, name, default_delay=3600, n_thread=None):
        """Open ``<name>.warc.gz`` and create the lock, scheduler and pool."""
        warc_file = open(f"{name}.warc.gz", "wb")
        self.writer = WARCWriter(warc_file)
        self.lock = RLock()
        self.scheduler = scheduler(time.time)
        self.executor = ThreadPoolExecutor(n_thread)
        self.default_delay = default_delay
        self.delay_adjusters = {}  # website -> delay adjuster
        self.last = {}  # website -> (timestamp, counters) of the last visit

    def visit(self, url):
        """Fetch *url*, archive the response and summarise its content.

        Returns:
            ``(counters, timestamp)`` where ``counters`` is the result of
            ``html_to_counters`` on the body and ``timestamp`` is the midpoint
            of the times taken just before and just after the request.
        """
        ts0 = time.time()
        resp = requests.get(url, headers=HEADERS, stream=True)
        ts1 = time.time()
        headers_list = resp.raw.headers.items()
        # NOTE(review): the status line is recorded as 200 OK regardless of
        # resp.status_code -- confirm this is intended.
        http_headers = StatusAndHeaders('200 OK',
                                        headers_list,
                                        protocol='HTTP/1.0')
        record = self.writer.create_warc_record(resp.url,
                                                'response',
                                                payload=BytesIO(resp.content),
                                                http_headers=http_headers)
        # `with` releases the lock even when the write raises; a bare
        # acquire()/release() pair would leave it held and block every other
        # worker thread from writing.
        with self.lock:
            self.writer.write_record(record)
        res = html_to_counters(resp.content)
        return res, (ts1 + ts0) / 2

    def foo(self, target, website, lower_limit, upper_limit):
        """Visit *website* once and schedule the next visit.

        The delay estimated by the site's delay adjuster is clamped to
        ``[lower_limit, upper_limit]``. Any exception is logged and swallowed,
        in which case the site is not rescheduled.
        """
        try:
            sr1, ts1 = self.visit(website)

            delay_adjuster = self.delay_adjusters[website]
            delay_adjuster.add_case(ts1, sr1)
            estimate = delay_adjuster.get_delay()

            new_delay = max(lower_limit, min(upper_limit, estimate))

            self.scheduler.enterabs(
                ts1 + new_delay, 1, self.executor.submit,
                (self.foo, target, website, lower_limit, upper_limit))

            if logging.root.level <= logging.INFO:
                delay = None
                sim = None
                ts0, sr0 = self.last.get(website, (None, None))
                self.last[website] = (ts1, sr1)
                if ts0 is not None and sr0 is not None:
                    sim = sr0.similarity(sr1)
                    delay = ts1 - ts0
                logging.info(
                    f"website={website}, delay={delay}, sim={sim}, estimate={estimate}, new_delay={new_delay}"
                )
        except Exception as e:
            logging.exception(e)
            logging.exception(f"website={website}")

    def harvest(self, websites, target, delay_adjuster=None):
        """Schedule the first visit of every website and run the scheduler.

        Args:
            websites: sized collection of URLs to harvest.
            target: passed to the delay adjuster factory and to ``foo``.
            delay_adjuster: optional factory called as
                ``delay_adjuster(default_delay, target)`` once per website;
                when ``None`` a ``ConstantDelayAdjuster`` is used.

        First visits are spread evenly over ``default_delay`` seconds,
        starting about 10 seconds from now. Returns immediately when
        *websites* is empty; otherwise it only returns after the scheduler
        raises.
        """
        logging.info(
            f"Starting harvest with {len(websites)} websites and delay adjuster "
            f"{str(delay_adjuster)}, target={target}")
        if not websites:
            # Nothing to schedule, and the spacing below would divide by zero.
            logging.warning("No websites given, nothing to harvest")
            return
        start_time = time.time() + 10
        diff = self.default_delay / len(
            websites)  # Distribute initial harvests equally
        for ws in websites:
            if delay_adjuster is None:
                self.delay_adjusters[ws] = ConstantDelayAdjuster(
                    self.default_delay)
            else:
                self.delay_adjusters[ws] = delay_adjuster(
                    self.default_delay, target)
            start_time += diff
            self.scheduler.enterabs(start_time, 1, self.executor.submit,
                                    (self.foo, target, ws, 60, 86400))
        while True:
            try:
                self.scheduler.run()
            except Exception as e:
                logging.exception(e)
                logging.exception(
                    f"Exception occured: {e}, stopping harvest...")
                return
Exemple #11
0
            args.polls = args.attachments = args.members = True

    with Mkchdir(args.group):
        log_file_handler = logging.FileHandler('archive.log')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        if args.warc:
            try:
                from warcio import WARCWriter
            except ImportError:
                logging.error(
                    'WARC output requires the warcio package to be installed.')
                exit(1)
            fhwarc = open('data.warc.gz', 'ab')
            warc_writer = WARCWriter(fhwarc)
            yga.set_warc_writer(warc_writer)

        if args.email:
            with Mkchdir('email'):
                archive_email(yga)
        if args.files:
            with Mkchdir('files'):
                archive_files(yga)
        if args.photos:
            with Mkchdir('photos'):
                archive_photos(yga)
        if args.database:
            with Mkchdir('databases'):
                archive_db(yga)
        if args.links:
Exemple #12
0
    def handle_download_name(self, user, coll_name, warc_name, url):
        """Write a landing page and a WARC export for a collection.

        Args:
            user: user name, resolved through ``self.user_manager``.
            coll_name: name of the collection to export.
            warc_name: export name; any ``10.25354/`` part is removed to form
                the on-disk file stem.
            url: passed through to the landing page template.

        Output goes under ``$STORAGE_REPLAY``: the landing page to
        ``lp/10.25354/<stem>.html`` and the records to
        ``warc/10.25354/<stem>.warc``. ``self._raise_error(404, ...)`` is
        called when the collection, or the stored path of one of its WARC
        files, is missing.
        """
        user_name = user
        user = self.user_manager.get_user(user)
        collection = user.get_collection_by_name(coll_name)
        if not collection:
            self._raise_error(404, 'no_such_collection')

        self.access.assert_can_write_coll(collection)

        # collection['uid'] = coll
        collection.load()

        Stats(self.redis).incr_download(collection)

        download_path = self.get_origin() + "/api/v1/download/{}/".format(
            user_name)
        warc_name_broke = warc_name.replace("10.25354/", "")
        local_storage = LocalFileStorage(self.redis)
        landingpage = template(
            'webrecorder/templates/landingpage.html',
            title=coll_name,
            warc_file=
            'https://projects.zo.uni-heidelberg.de/webarchive/warc/10.25354/' +
            warc_name_broke + '.warc',
            url=url)

        replay_root = os.environ['STORAGE_REPLAY']
        lp_dir = os.path.join(replay_root, 'lp', '10.25354')
        warc_dir = os.path.join(replay_root, 'warc', '10.25354')
        for directory in (lp_dir, warc_dir):
            try:
                os.makedirs(directory)
                # Report the directory itself, not the result of isfile().
                print("Directory '% s' created" % directory)
            except FileExistsError:
                print("Directory '% s' already created!" % directory)
            except FileNotFoundError:
                print("Directory '% s' No such file or directory!" % directory)

        lp_path = os.path.join(lp_dir, warc_name_broke) + ".html"
        try:
            # `with` closes the file even if the write fails.
            with open(lp_path, 'w') as f:
                f.write(landingpage)
        except FileExistsError:
            print(lp_path + " exists")
        except FileNotFoundError:
            print(lp_path + " doesn't exists")
        commit_storage = collection.get_storage()

        warc_out_path = os.path.join(warc_dir, warc_name_broke) + ".warc"
        for recording in collection.get_recordings():
            is_committed = recording.is_fully_committed()
            is_open = not is_committed and recording.get_pending_count() > 0
            storage = commit_storage if is_committed else local_storage
            try:
                # NOTE(review): 'wb' truncates the same output file for every
                # recording, so only the last recording's records survive --
                # confirm whether collections can hold several recordings.
                with open(warc_out_path, 'wb') as f:
                    writer = WARCWriter(f, gzip=True)
                    for name, path in recording.iter_all_files(
                            include_index=False):
                        local_download = download_path.format(
                            user=user.name,
                            coll=collection.name,
                            filename=name)
                        warc_key = collection.get_warc_key()
                        warc_path = self.redis.hget(warc_key, name)
                        # Check for a missing entry before touching the value;
                        # a substring test on None would raise TypeError
                        # instead of producing the intended 404.
                        if not warc_path:
                            self._raise_error(404, 'file_not_found')
                        warc_path = warc_path.replace('http://nginx:6090', '')
                        warc_path = warc_path.replace('https://nginx:6090', '')
                        with open(warc_path, 'rb') as stream:
                            for record in ArchiveIterator(stream):
                                writer.write_record(record)
            except FileExistsError:
                print(warc_out_path + " exists")
            except FileNotFoundError:
                print(warc_out_path + " doesn't exists")
Exemple #13
0
def sample_warc_stream(ratio, warc_in, warc_out, options):
    """Write a random sample of the response records in *warc_in* to *warc_out*.

    Each ``response`` record is kept when ``random.random() <= ratio``. Unless
    ``options.language`` is ``ANY_LANGUAGE``, a kept record must also contain
    extractable text that langdetect assigns to ``options.language`` with a
    probability of at least ``options.lang_prob``. Output is gzip-compressed
    and progress is printed to stderr every 10000 records.
    """
    if options.language != ANY_LANGUAGE:
        from langdetect import DetectorFactory, detect_langs
        DetectorFactory.seed = options.seed  # Make langdetect deterministic

    writer = WARCWriter(warc_out, gzip=True)

    responses = total = errors = empties = notlang = 0
    for total, record in enumerate(ArchiveIterator(warc_in), start=1):
        if not total % 10000:
            print(
                f'sample_warc_responses.py: processed {total} records, '
                f'{responses} responses, {errors} errors, {empties} empty, '
                f'{notlang} not in target language.',
                file=sys.stderr)

        if record.rec_type != 'response':
            continue
        responses += 1
        id_ = get_id(record)

        # Random sampling happens before the (expensive) language check.
        if random.random() > ratio:
            continue

        if options.language != ANY_LANGUAGE:
            # Workaround for https://github.com/webrecorder/warcio/issues/114
            payload_copy = BytesIO(record.raw_stream.read())
            record = copy_warc_record(record, payload_copy)
            content = record.content_stream().read()
            # force length recalculation to work around issues with
            # header encoding changes causing content-length mismatch
            # (related: https://github.com/webrecorder/warcio/issues/104)
            record.length = None
            payload_copy.seek(0)

            try:
                text_content = trafilatura.extract(content)
            except Exception as e:
                logging.error(f'failed extract for {id_}: {e}')
                errors += 1
                continue
            if not text_content:
                empties += 1
                continue

            try:
                langs = detect_langs(text_content)
            except Exception as e:
                logging.error(f'failed langdetect for {id_}: {e}')
                errors += 1
                continue

            # First detection result for the requested language, if any.
            match = next(
                (cand for cand in langs if cand.lang == options.language),
                None)
            if match is None or match.prob < options.lang_prob:
                notlang += 1
                continue

        try:
            writer.write_record(record)
        except Exception as e:
            logging.error(f'failed to write record {id_}: {e}')
            errors += 1
Exemple #14
0
def main():
    """Command-line entry point: archive the selected parts of a group.

    Sets up logging (stdout or coloured output, plus ``archive.log`` inside
    the group directory), builds the API client, and runs one archive step per
    selected content type, each in its own sub-directory. With ``args.warc``
    all traffic is additionally recorded to ``data.warc.gz``.
    """
    args = parse_arguments()

    # Setup logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    log_format = {
        'fmt': '%(asctime)s %(levelname)s %(name)s %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S.%f %Z'
    }
    log_formatter = CustomFormatter(**log_format)

    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.ERROR
    else:
        log_level = logging.INFO
    if args.colour:
        try:
            import coloredlogs
        except ImportError:
            sys.exit(
                "Error: Coloured logging output requires the 'coloredlogs' package to be installed."
            )
        coloredlogs.install(level=log_level, **log_format)
    else:
        log_stdout_handler = logging.StreamHandler(sys.stdout)
        log_stdout_handler.setLevel(log_level)
        log_stdout_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_stdout_handler)

    cookie_jar = init_cookie_jar(args.cookie_file, args.cookie_t,
                                 args.cookie_y, args.cookie_e)

    headers = {}
    if args.user_agent:
        headers['User-Agent'] = args.user_agent

    yga = YahooGroupsAPI(args.group, cookie_jar, headers, min_delay=args.delay)

    # Default to all unique content. This includes topics and raw email,
    # but not the full email download since that would duplicate html emails we get through topics.
    if not (args.email or args.files or args.photos or args.database
            or args.links or args.calendar or args.about or args.polls
            or args.attachments or args.members or args.topics or args.raw):
        args.files = args.photos = args.database = args.links = args.calendar = args.about = \
            args.polls = args.attachments = args.members = args.topics = args.raw = True

    with Mkchdir(args.group, sanitize=False):
        log_file_handler = logging.FileHandler('archive.log', 'a', 'utf-8')
        log_file_handler.setFormatter(log_formatter)
        root_logger.addHandler(log_file_handler)

        fhwarc = None
        # try/finally so the WARC file is closed even when an archive step
        # raises part-way through the run.
        try:
            if args.warc:
                # Only the import is guarded: other failures during WARC
                # setup should not be reported as a missing package.
                try:
                    from warcio import WARCWriter
                except ImportError:
                    logging.error(
                        'WARC output requires the warcio package to be installed.')
                    # sys.exit rather than the site-provided exit() builtin,
                    # which is not guaranteed to exist.
                    sys.exit(1)
                fhwarc = open('data.warc.gz', 'ab')
                warc_writer = WARCWriter(fhwarc)
                warcmeta = warc_writer.create_warcinfo_record(
                    fhwarc.name, WARC_META_PARAMS)
                warc_writer.write_record(warcmeta)
                yga.set_warc_writer(warc_writer)
            if args.overwrite:
                hacky_vars['file'] = True
            if args.email:
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  noAttachments=args.noattachments)
            if args.files:
                with Mkchdir('files'):
                    archive_files(yga)
            if args.photos:
                with Mkchdir('photos'):
                    archive_photos(yga)
            if args.topics:
                with Mkchdir('topics'):
                    archive_topics(yga, noAttachments=args.noattachments)
            if args.raw:
                # Raw email shares the 'email' directory with the full
                # email download above.
                with Mkchdir('email'):
                    archive_email(yga,
                                  message_subset=args.ids,
                                  start=args.start,
                                  stop=args.stop,
                                  skipHTML=True)
            if args.database:
                with Mkchdir('databases'):
                    archive_db(yga)
            if args.links:
                with Mkchdir('links'):
                    archive_links(yga)
            if args.about:
                with Mkchdir('about'):
                    archive_about(yga)
            if args.polls:
                with Mkchdir('polls'):
                    archive_polls(yga)
            if args.attachments:
                with Mkchdir('attachments'):
                    archive_attachments(yga)
            if args.members:
                with Mkchdir('members'):
                    archive_members(yga)
            if args.calendar:
                with Mkchdir('calendar'):
                    archive_calendar(yga)
        finally:
            if fhwarc is not None:
                fhwarc.close()