Exemple #1
0
def bios_from_wet_url(url, verbose=False):
    """Stream one WET archive and return the de-duplicated bios found in it.

    Args:
        url: Address of the archive to download.
        verbose: When true, print the HTTP status code once the request has
            succeeded.

    Returns:
        The result of ``dedup_exact`` over every bio that
        ``extract_bios_from_page`` produced for the archive's records, or
        ``None`` when anything fails (the error is reported on stderr
        instead of being raised).
    """
    try:
        time0 = time.time()
        log("TRYING "+url)
        # The context manager releases the streamed connection even when
        # parsing fails half-way through the archive.
        with requests.get(url, stream=True) as r:
            # Raised explicitly (rather than asserted) so that the check is
            # not stripped when Python runs with -O; the message is unchanged.
            if r.status_code != 200:
                raise RuntimeError(f"*** Got status code {r.status_code} != 200")

            if verbose:
                print(f"Status code {r.status_code} for {url}")

            it = ArchiveIterator(fileobj=r.raw)
            # Skip the first record (presumably the archive's header record,
            # which carries no page text -- TODO confirm).
            next(it)

            ans = dedup_exact([
                bio
                for record in it
                for bio in extract_bios_from_page(
                    record.content_stream().read().decode()[:MAX_PAGE_LEN],
                    record.rec_headers.get_header('WARC-Target-URI'))
            ])
        log(f"DONE {url} {time.time()-time0:.1f} seconds")
        return ans

    except Exception as e:
        # Best-effort by design: one bad archive must not abort the caller.
        print(f"*** Exception in {url}:", file=sys.stderr)
        print(f"*** {e}", file=sys.stderr)
        print("***", file=sys.stderr)
        print("", file=sys.stderr)
        return None
Exemple #2
0
 def get_record(self, url):
     """Return the (request, response) record pair stored for *url*.

     The index entry for *url* holds two items, each of whose first element
     is a stream offset; the stream is positioned at each offset in turn and
     the first record found there is read.

     Raises:
         KeyError: if *url* has no entry in ``self.url_index``.
     """
     index_entry = self.url_index.get(url)
     if index_entry is None:
         raise KeyError('The request or response is missing from the archive for URL: {0}'.format(url))

     found = []
     # Slot 0 is the request, slot 1 the response.
     for slot in (0, 1):
         self._stream.seek(index_entry[slot][0])
         found.append(next(iter(ArchiveIterator(self._stream))))
     return tuple(found)
    def deduplicate(self):
        """Copy ``self.warc_source`` to ``self.warc_target``, replacing
        duplicate responses with revisit records.

        The source archive is read twice: once to collect a lookup key for
        every response record, and once more (after ``self.fetch_from_ia``
        has been given the collected keys) to write the target archive.
        A log record produced by ``self._log`` is appended at the end.
        """
        self._log.log('Start deduplication process.')

        # Pass 1: register every response record under its
        # (payload digest, target URI) key with a placeholder value.
        iaData = {} # dict of (payload digest, URL) => IA response|None
        with open(self.warc_source, 'rb') as s:
            for record in ArchiveIterator(s):
                if record.rec_headers.get_header('WARC-Type') == 'response':
                    iaData[(record.rec_headers.get_header('WARC-Payload-Digest'), record.rec_headers.get_header('WARC-Target-URI'))] = None

        # Presumably fills in the values of iaData in place -- the helper is
        # not visible here; verify against its definition.
        self.fetch_from_ia(iaData)

        # Pass 2: re-read the source and write the (possibly rewritten)
        # records to the target; gzip is chosen from the target's extension.
        with open(self.warc_source, 'rb') as s, \
                open(self.warc_target, 'wb') as t:
            writer = WARCWriter(filebuf=t, gzip=self.warc_target.endswith('.gz'))
            for record in ArchiveIterator(s):
                url = record.rec_headers.get_header('WARC-Target-URI')
                record_id = record.rec_headers.get_header('WARC-Record-ID')
                self._log.log('Processing record {}.'.format(record_id))
                # Target URIs wrapped in angle brackets are unwrapped and the
                # header is rewritten with the bare URL.
                # NOTE(review): the regex match is not checked, so a URI that
                # starts with '<' but does not end with '>' raises here.
                if url is not None and url.startswith('<'):
                    url = re.search('^<(.+)>$', url).group(1)
                    self._log.log('Replacing URL in record {} with {}.'
                                  .format(record_id, url))
                    record.rec_headers.replace_header('WARC-Target-URI', url)
                if record.rec_headers.get_header('WARC-Type') == 'response':
                    self._log.log('Deduplicating record {}.'.format(record_id))
                    # NOTE(review): pass 1 keyed on the *unmodified* target
                    # URI, while this key is built after the angle brackets
                    # were stripped; unless fetch_from_ia re-keys the dict,
                    # the assertion below would fail for such records --
                    # confirm.
                    key = (record.rec_headers.get_header('WARC-Payload-Digest'), record.rec_headers.get_header('WARC-Target-URI'))
                    assert key in iaData
                    if iaData[key]:
                        # Truthy value: write a revisit record in place of
                        # the response.
                        self._log.log('Record {} is a duplicate from {}.'
                                      .format(record_id, iaData[key]))
                        writer.write_record(
                            self.response_to_revisit(writer, record, iaData[key])
                        )
                    else:
                        # Falsy value: ``False`` is logged as "could not be
                        # deduplicated", anything else as "not a duplicate";
                        # either way the response is kept as-is.
                        if iaData[key] is False:
                            self._log.log('Record {} could not be deduplicated.'
                                .format(record_id))
                        else:
                            self._log.log('Record {} is not a duplicate.'
                                .format(record_id))
                        self.register_response(record)
                        writer.write_record(record)
                elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                    # Tell the log which warcinfo record it belongs to and
                    # point the record's filename at the new archive.
                    self._log.set_warcinfo(record.rec_headers.get_header('WARC-Record-ID'))
                    record.rec_headers.replace_header('WARC-Filename', self.warc_target)
                    writer.write_record(record)
                else:
                    # Every other record type is copied unchanged.
                    writer.write_record(record)
            self._log.log('Writing log to WARC.')
            writer.write_record(self._log.create_record(writer))
Exemple #4
0
def _gzip_warc(basename):
    """Re-write ``<basename>.warc`` record by record into ``<basename>.warc.gz``."""
    with open(basename + ".warc", 'rb') as f_in, \
            open(basename + ".warc.gz", 'wb') as f_out:
        writer = WARCWriter(f_out, gzip=True)
        for record in ArchiveIterator(f_in):
            writer.write_record(record)


def run(url, outPath, timeLimit, agent, filetypes, warcfilename, wait):
    """Mirror *url* with wget and leave a gzip-compressed WARC behind.

    Args:
        url: Site to mirror.
        outPath: Download directory handed to wget's ``-P``.
        timeLimit: If truthy, the wget call is wrapped in ``timeout``.
        agent: Optional user-agent string.
        filetypes: Optional accept list handed to wget's ``-A``.
        warcfilename: Name of the WARC to produce; everything from the first
            ``.warc.gz`` onwards is dropped to obtain wget's base name.  When
            ``None`` no WARC is requested and nothing is converted.
        wait: Optional value for wget's ``--wait`` (a string).

    A failing wget run (``subprocess.CalledProcessError``) is not fatal: the
    WARC written so far is still converted and a warning goes to stderr.
    """
    cmd = ""
    if timeLimit:
        cmd += "timeout {} ".format(timeLimit)

    waitoption = "" if wait is None else "--wait " + wait
    agentoption = "" if agent is None else "--user-agent \"" + agent + "\""
    filetypesoption = "" if filetypes is None else "-A \"" + filetypes + "\""

    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # str.find returns -1 when the suffix is absent; slicing with that
        # would silently drop the last character, so keep the name whole.
        suffix_at = warcfilename.find(".warc.gz")
        warcfilebasename = warcfilename if suffix_at == -1 else warcfilename[:suffix_at]
        warcoption = "--warc-file \"" + warcfilebasename + "\""

        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"

    # NOTE(review): url and outPath are interpolated into a shell command
    # unquoted -- callers must not pass untrusted values.
    cmd += "wget --mirror {WAIT} {FILETYPES} -q {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption,
        FILETYPES=filetypesoption,
        URL=url,
        DOWNLOAD_PATH=outPath,
        AGENT=agentoption,
        WARC=warcoption)

    download_failed = False
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        download_failed = True

    if warcfilebasename is not None:
        _gzip_warc(warcfilebasename)

    if download_failed:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")

    if warcfilebasename is not None:
        # The uncompressed intermediate is no longer needed.
        system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
def decode(x, record_attribute):
    """Turn one raw WARC chunk into ``(header value, visible text)`` pairs.

    Args:
        x: A 2-tuple whose second element is an iterable of text pieces that
            together form the archive entry (the first element is ignored).
        record_attribute: Name of the WARC header whose value identifies each
            page in the result.

    Returns:
        A list of ``(header value, whitespace-normalised UTF-8 text)`` tuples,
        one per HTTP response record.  Whatever was collected before a
        parsing error is still returned.
    """
    pages = []
    _, payload = x
    # The "WARC/1.0 " prefix is prepended to the joined pieces before parsing.
    raw_text = "WARC/1.0 " + ''.join([c.encode('utf-8') for c in payload])

    from cStringIO import StringIO
    from warcio.archiveiterator import ArchiveIterator
    from html2text import HTML2Text
    from bs4 import BeautifulSoup
    try:
        for record in ArchiveIterator(StringIO(raw_text)):
            # Only HTTP responses carry an HTML page worth extracting.
            if record.rec_type != 'response' or record.http_headers is None:
                continue

            page_key = record.rec_headers.get_header(record_attribute)
            soup = BeautifulSoup(record.content_stream().read(), "html5lib")
            visible_text = ' '.join(filter(visible, soup.findAll(text=True)))
            # Collapse every run of whitespace into a single space.
            normalised = ' '.join(visible_text.split()).encode('utf-8')
            pages.append((page_key, normalised))
    except Exception:
        print("Something went wrong with the archive entry")

    return pages
Exemple #6
0
def warcToText(url):
    """Stream the archive at *url* and print the plain text of its HTML pages.

    Only HTTP response records whose ``Content-Type`` header exactly equals
    one of the accepted values below are converted.  A summary of how many
    pages failed and succeeded is printed at the end.
    """
    # Exact header values that are treated as HTML pages.
    accepted_types = frozenset((
        'text/html',
        'text/html; charset=UTF-8',
        'text/html; charset=utf-8',
        'text/html; charset=ISO-8859-1',
        'charset=iso-8859-1',
    ))
    fail = 0
    succeed = 0
    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True) as resp:
        for record in ArchiveIterator(resp.raw, arc2warc=True):
            # HTML pages are stored as HTTP response records.
            if record.rec_type != 'response' or record.http_headers is None:
                continue
            # Look the header up once instead of once per accepted value.
            if record.http_headers.get_header('Content-Type') not in accepted_types:
                continue
            try:
                html = record.content_stream().read()
                # From HTML to plain text.
                html_parse = html5lib.parseFragment(html)
                print(''.join(html_parse.itertext()))
                succeed += 1
            except Exception:
                # A page that cannot be parsed is counted and skipped.
                fail += 1
    print('fail: %s'%(fail))
    print('succeed: %s'%(succeed))
Exemple #7
0
def handleOneSegment(warc_file_path, site_list, is_fake=1):
    """Store every HTML response of a WARC file that belongs to a known site.

    Args:
        warc_file_path: Path of the WARC file to scan.
        site_list: Passed to ``matchSite`` together with each target URI.
        is_fake: Forwarded unchanged to ``storeInSQL``.

    For each response record whose ``Content-Type`` starts with
    ``text/html`` and whose ``WARC-Target-URI`` is an http(s) URL matched by
    ``matchSite``, the page is handed to ``storeInSQL``.
    """
    connection = pymysql.connect(host='localhost',
                                 port=8889,
                                 user='******',
                                 password='******',
                                 db='gingko',
                                 cursorclass=pymysql.cursors.DictCursor)
    # try/finally so the connection is released even if parsing or storing
    # raises part-way through the file.
    try:
        with open(warc_file_path, 'rb') as f:
            for record in ArchiveIterator(f):
                if record.rec_type != 'response':
                    continue
                # Response records without parsed HTTP headers cannot be
                # HTML pages; skip them instead of failing on ``None``.
                if record.http_headers is None:
                    continue
                headers = record.http_headers.headers
                content_type = next(
                    (h[1] for h in headers if h[0] == 'Content-Type'), "")
                if not content_type.startswith("text/html"):
                    continue
                html = record.content_stream().read().decode("cp437")
                rec_headers = record.rec_headers.headers
                for h in rec_headers:
                    if h[0] != 'WARC-Target-URI':
                        continue
                    if h[1].startswith(("http://", "https://")):
                        site = matchSite(site_list, h[1])
                        if site:
                            storeInSQL(connection, site, headers, rec_headers, is_fake, html)
    finally:
        connection.close()
Exemple #8
0
def download_row(row, language, prefix):
    """
    Fetch the WARC slice described by *row* and return its filtered content.

    row:      row object, as returned by the CC index. Contains filename, length, offset
    language: language currently used, not really necessary but useful for data structure
    prefix:   prefix of the S3 dataset, usually https://commoncrawl.s3.amazonaws.com/

    Returns ``(language, item.to_detect)`` for the first record that is not
    filtered out and has non-empty content; ``None`` when no record
    qualifies or the download fails.
    """
    url = prefix + row.filename
    url = url.replace(',', '').replace('}', '')  # Strip infrequent last characters

    # This is necessary since the CC data sometimes (but not always) contains a comma
    offset = int(row.offset.replace(',', ''))
    length = int(row.length.replace(',', ''))
    # HTTP byte ranges are inclusive on both ends, so the last wanted byte
    # is offset + length - 1; asking for one more pulls in the first byte of
    # the following record.
    end = offset + length - 1

    headers = {"Range": "bytes={}-{}".format(offset, end)}

    try:
        # Download the WARC slice for this specific domain; the context
        # manager releases the streamed connection on every exit path.
        with requests.get(url, headers=headers, stream=True) as resp:
            for record in ArchiveIterator(resp.raw, arc2warc=True):
                try:
                    item = FilteredItem(Item(record.content_stream().read()))
                    if not item.filter_out:
                        if item.to_detect == '':
                            raise Exception('Empty item %s %s %s' % (item.filter_out, item.to_detect, url))
                        return language, item.to_detect
                except Exception as e:
                    # A bad record is skipped; the next one may still be usable.
                    logging.debug('Skipping record %s, got exception %s', record, e)
    except Exception as e:
        logging.info('Warc Error: %s', e)
        return
Exemple #9
0
def extract_record(cmd):
    """Write the record found at ``cmd.offset`` in ``cmd.filename`` to stdout.

    With ``cmd.payload`` set, only the record's content stream is written.
    Otherwise the WARC headers are written, followed by the HTTP headers
    (when present) and -- unless ``cmd.headers`` is set -- the raw stream.
    """
    chunk_size = 65536

    def pump(source, sink):
        # Copy in fixed-size chunks until the source is exhausted.
        while True:
            chunk = source.read(chunk_size)
            if not chunk:
                break
            sink.write(chunk)

    with open(cmd.filename, 'rb') as fh:
        fh.seek(int(cmd.offset))
        record = next(iter(ArchiveIterator(fh)))

        # Use the binary layer of stdout when there is one.
        out = getattr(sys.stdout, 'buffer', sys.stdout)

        if cmd.payload:
            pump(record.content_stream(), out)
            return

        out.write(record.rec_headers.to_bytes())
        if record.http_headers:
            out.write(record.http_headers.to_bytes())
        if not cmd.headers:
            pump(record.raw_stream, out)
Exemple #10
0
def test_log ():
    """
    Log entries are buffered by the handler, flushed to the WARC once
    maxLogSize is exceeded, and can be read back as JSON lines.
    """
    logger = Logger ()

    with NamedTemporaryFile() as fd:
        with WarcHandler (fd, logger) as handler:
            logger.connect (WarcHandlerConsumer (handler))
            expected = []

            # the first entry stays in the (initially empty) log buffer
            assert handler.log.tell () == 0
            expected.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ'))
            assert handler.log.tell () != 0

            # with a zero size limit the next entry should flush the log
            handler.maxLogSize = 0
            expected.append (logger.info (bar=1, baz='baz'))
            assert handler.log.tell () == 0

        fd.seek (0)
        for record in ArchiveIterator (fd):
            headers = record.rec_headers
            assert headers['warc-type'] == 'metadata'
            assert 'warc-target-uri' not in headers
            assert headers['x-crocoite-type'] == 'log'
            assert headers['content-type'] == f'application/json; charset={handler.logEncoding}'

            # every line of the record is one entry, in logging order
            for line in iter (record.raw_stream.readline, b''):
                assert json.loads (line.strip ()) == expected.pop (0)
    def download_url(self, url):
        """Return the decoded response body stored in the archive for *url*.

        The body is decoded with the encoding named in the record's
        ``WARC-X-Detected-Encoding`` header (``UTF-8`` when absent), ignoring
        undecodable bytes.  When *url* is not in the internal index a
        CRITICAL entry is logged and ``None`` is returned.
        """
        index_entry = self._internal_url_index.get(url)
        if index_entry is None:
            self._logger.log('CRITICAL',
                             url,
                             'URL not found in WARC!',
                             sep='\t')
            return None

        # Only the offset of the response half of the pair is needed.
        response_offset = index_entry[1][0]
        # The record is re-read each time (not cached) because it is also
        # written out to the new archive.
        self._stream.seek(response_offset)
        records = ArchiveIterator(self._stream,
                                  check_digests=self._check_digest)
        record = next(iter(records))

        payload = record.content_stream().read()
        assert len(payload) > 0
        encoding = record.rec_headers.get_header('WARC-X-Detected-Encoding',
                                                 'UTF-8')
        return payload.decode(encoding, 'ignore')
    def read_memento(self, murl=None):
        """
        Read the stored content of a memento.

        The file path comes from ``self.lookup_memento(murl)``.  A path
        containing the configured WARC extension is parsed as a WARC archive
        and only its first ``response`` record is considered; a path
        containing ``.html`` is read as text.

        Parameters:
            murl: URI-M descriptor; indexed with ``"uri"`` in the debug
                output, so presumably a mapping rather than a plain string
                -- TODO confirm against callers.

        Returns:
            The record's content stream data (WARC) or the file's text
            (``.html``) on success.  ``None`` when the memento is not found,
            when the response is rejected by the size check below, or when
            reading fails (the error is written to stderr).
        """
        mpath = self.lookup_memento(murl)
        # Fetched unconditionally; only its "timestamp" entry is used below.
        response = Utils.get_murl_info(murl, self.__thandle)
        if mpath:
            if self.__constants.WARC_EXT in mpath:
                try:
                    with open(mpath, 'rb') as stream:
                        for record in ArchiveIterator(stream):
                            if record.rec_type == 'response':
                                if self.__config.debug: sys.stdout.write(str(murl["uri"]) + " Content Size: " + str(record.rec_headers.get_header('Content-Length')) + "\n")
                                # Size check: captures before 2009 with a
                                # Content-Length under 1000, or after 2020
                                # with one under 100000, are rejected.
                                if (int(response["timestamp"]) < 20090101000000 and int(record.rec_headers.get_header('Content-Length')) < 1000) or (int(response["timestamp"]) > 20200101000000 and int(record.rec_headers.get_header('Content-Length')) < 100000):
                                    return None
                                else:
                                    return record.content_stream().read()


                except Exception as e:
                    # Best-effort: report and fall through to ``return None``.
                    sys.stderr.write("Memento Read Error: " + str(e) + "\n")
            elif ".html" in mpath:
                try:
                    with open(mpath, "r") as stream:
                        return stream.read()
                except Exception as e:
                    sys.stderr.write("Memento Read Error: " + str(e) + "\n")
        return None
Exemple #13
0
def test_different_payload(writer):
    """
    Same URL captured twice with differing response bodies; the merged
    output is compared against the golden built from all written records
    """

    target = 'http://example.com/'
    records = []
    for i in range (2):
        # one header dict is shared by the request and response of a pair
        pairHeaders = {}

        requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        request = writer.create_warc_record (target, 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=pairHeaders, http_headers=requestHeaders)

        responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        response = writer.create_warc_record (target, 'response',
                payload=BytesIO(f'data{i}'.encode ('utf8')),
                warc_headers_dict=pairHeaders, http_headers=responseHeaders)

        records += [request, response]

    for rec in records:
        writer.write_record (rec)

    merged = NamedTemporaryFile()
    mergeWarc ([writer.out.name], merged)

    merged.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (merged))
Exemple #14
0
def test_unmodified(writer):
    """
    One request/response pair and nothing to revisit; the merged output is
    compared against the golden built from the written records
    """

    target = 'http://example.com/'
    # one header dict is shared by the request and the response
    pairHeaders = {}

    requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    request = writer.create_warc_record (target, 'request',
            payload=BytesIO(b'foobar'),
            warc_headers_dict=pairHeaders, http_headers=requestHeaders)

    responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
    response = writer.create_warc_record (target, 'response',
            payload=BytesIO(b'data'),
            warc_headers_dict=pairHeaders, http_headers=responseHeaders)

    records = [request, response]
    for rec in records:
        writer.write_record (rec)

    merged = NamedTemporaryFile()
    mergeWarc ([writer.out.name], merged)

    merged.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (merged))
Exemple #15
0
def test_resp_revisit_same_url(writer):
    """
    Same URL captured twice with identical bodies; the second response is
    expected to come out of the merge as a revisit of the first
    """

    target = 'http://example.com/'
    records = []
    for i in range (2):
        # one header dict is shared by the request and response of a pair
        pairHeaders = {}

        requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        request = writer.create_warc_record (target, 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=pairHeaders, http_headers=requestHeaders)

        responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        response = writer.create_warc_record (target, 'response',
                payload=BytesIO(b'data'),
                warc_headers_dict=pairHeaders, http_headers=responseHeaders)

        records += [request, response]

    for rec in records:
        writer.write_record (rec)

    # in the expected output the last response is swapped for a revisit
    # that refers to the first response (index 1)
    duplicate = records.pop ()
    original = records[1]
    records.append (makeRevisit (writer, original, duplicate))

    merged = NamedTemporaryFile()
    mergeWarc ([writer.out.name], merged)

    merged.seek(0)
    recordsEqual (makeGolden (writer, records), ArchiveIterator (merged))
Exemple #16
0
def process_warc_archive(warc_path):
    """Process every eligible record of one Common Crawl archive on S3.

    Args:
        warc_path: Key of the archive inside the ``commoncrawl`` bucket.

    Returns:
        ``(warc_path, seconds spent, processed record count, ignored record
        count, match_stats)`` where ``match_stats`` is the dict handed to
        ``process_record`` for every processed record.
    """
    sentry_sdk.init(
        "https://[email protected]/1409316")

    start = time.time()
    processed_records = 0
    ignored_records = 0
    match_stats = {}

    fs = s3fs.S3FileSystem(anon=True)
    # The context manager closes the remote file handle even if a record
    # fails to process.
    with fs.open('commoncrawl/%s' % warc_path, 'rb') as warc_file:
        for i, record in enumerate(ArchiveIterator(warc_file, arc2warc=True)):
            _should_process_record, data = should_process_record(record)
            if not _should_process_record:
                ignored_records += 1
                continue

            headers, body = data

            process_record(warc_path, record, headers, body, match_stats)

            processed_records += 1

            # Status is only reported from iterations that processed a record.
            if i % REPORT_STATUS_EVERY == 0:
                report_status(i, start, processed_records, ignored_records,
                              warc_path)

    spent = time.time() - start
    return warc_path, spent, processed_records, ignored_records, match_stats
Exemple #17
0
 def _read_first_response(self, filename):
     """Return the content of the first ``response`` record in a test file.

     Records that come before it are read to the end and discarded.
     Returns ``None`` implicitly when the file has no response record.
     """
     with open(get_test_file(filename), 'rb') as fh:
         for record in ArchiveIterator(fh):
             # Every record is read in full; only a response is returned.
             body = record.content_stream().read()
             if record.rec_type == 'response':
                 return body
Exemple #18
0
    def process_warcs(self, id_, iterator):
        """Open each WARC named by *iterator* and yield the per-record results.

        Args:
            id_: Not used inside this method.
            iterator: Iterable of input URIs.  ``s3://`` URIs are downloaded
                to a temporary file first, ``hdfs://`` URIs are rejected,
                anything else is opened as a local file (an optional
                ``file:`` prefix is stripped).

        Yields:
            Whatever ``self.process_record(record)`` yields for every record
            of every archive that could be opened.

        ``warc_input_processed`` is bumped for every URI,
        ``warc_input_failed`` when a download, a local open or the WARC
        parsing fails, and ``records_processed`` once per record.
        """
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # S3 client (not thread-safe, initialize outside parallelized loop)
        # Requests are sent unsigned.
        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            self.warc_input_processed.add(1)
            if uri.startswith('s3://'):
                self.get_logger().info('Reading from S3 {}'.format(uri))
                s3match = s3pattern.match(uri)
                if s3match is None:
                    self.get_logger().error("Invalid S3 URI: " + uri)
                    continue
                bucketname = s3match.group(1)
                path = s3match.group(2)
                # The object is spooled into a temporary file which then
                # serves as the input stream.
                warctemp = TemporaryFile(mode='w+b',
                                         dir=self.args.local_temp_dir)
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    self.get_logger().error('Failed to download {}: {}'.format(
                        uri, exception))
                    self.warc_input_failed.add(1)
                    warctemp.close()
                    continue
                # Rewind so that parsing starts at the first byte.
                warctemp.seek(0)
                stream = warctemp
            elif uri.startswith('hdfs://'):
                self.get_logger().error("HDFS input not implemented: " + uri)
                continue
            else:
                self.get_logger().info('Reading local stream {}'.format(uri))
                if uri.startswith('file:'):
                    uri = uri[5:]
                # Relative paths are resolved against this module's
                # directory (os.path.join leaves absolute paths untouched).
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    self.get_logger().error('Failed to open {}: {}'.format(
                        uri, exception))
                    self.warc_input_failed.add(1)
                    continue

            # Record parsing is switched off when HTTP header parsing is
            # not wanted.
            no_parse = (not self.warc_parse_http_header)
            try:
                for record in ArchiveIterator(stream,
                                              no_record_parse=no_parse):
                    for res in self.process_record(record):
                        yield res
                    self.records_processed.add(1)
            except ArchiveLoadFailed as exception:
                self.warc_input_failed.add(1)
                self.get_logger().error('Invalid WARC: {} - {}'.format(
                    uri, exception))
            finally:
                # Closes the temporary or local file whether or not parsing
                # succeeded.
                stream.close()
Exemple #19
0
    def process(self, item):
        """Write a deduplicated copy of the item's WARC next to the original.

        Every response record is looked up with ``self.ia_available``; when
        that returns something truthy a revisit record built by
        ``self.revisit_record`` is written in its place.  All other records
        are copied unchanged.
        """
        source_path = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
        target_path = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item

        with open(source_path, 'rb') as file_in, \
                open(target_path, 'wb') as file_out:
            writer = WARCWriter(filebuf=file_out, gzip=True)
            for record in ArchiveIterator(file_in):
                headers = record.rec_headers
                if headers.get_header('WARC-Type') != 'response':
                    writer.write_record(record)
                    continue

                ia_record = self.ia_available(
                    headers.get_header('WARC-Target-URI'),
                    headers.get_header('WARC-Payload-Digest'))
                if ia_record:
                    print('Found duplicate, writing revisit record.')
                    writer.write_record(
                        self.revisit_record(writer, record, ia_record))
                else:
                    writer.write_record(record)
Exemple #20
0
    def test_capture_to_temp_file_append(self):
        """Capturing twice into the same file yields both captures in order."""
        full_path = os.path.join(self.temp_dir, 'example.warc.gz')
        url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

        # Two separate captures of the same request into one file.
        for _ in range(2):
            with capture_http(full_path):
                res = requests.get(url)

        with open(full_path, 'rb') as stream:
            ai = ArchiveIterator(stream)
            # Each capture contributes a response followed by its request.
            for expected_type in ('response', 'request', 'response', 'request'):
                record = next(ai)
                assert record.rec_type == expected_type
                assert record.rec_headers['WARC-Target-URI'] == url

        os.remove(full_path)
    def test_unseekable(self):
        """ Test iterator on unseekable 3 record uncompressed WARC input.

        The archive is piped through ``cat`` so the iterator only ever sees
        a pipe; record types and offsets are checked while mixing ``for``
        loops with explicit ``next`` calls on the same iterator.
        """
        proc = subprocess.Popen(
            ['cat', get_test_file('example-iana.org-chunked.warc')],
            stdout=subprocess.PIPE)
        with closing(ArchiveIterator(proc.stdout)) as a:
            # First record, taken through a for loop that is left at once.
            for record in a:
                assert record.rec_type == 'warcinfo'
                assert a.get_record_offset() == 0
                break

            # Second record, taken with next() on the same iterator.
            record = next(a)
            assert record.rec_type == 'response'
            assert a.get_record_offset() == 405

            # Third record, again through a for loop.
            for record in a:
                assert record.rec_type == 'request'
                assert a.get_record_offset() == 8379
                break

            # Nothing is left after the third record.
            with pytest.raises(StopIteration):
                record = next(a)

        # State expected once the iterator has been closed.
        assert a.record == None
        assert a.reader == None
        assert a.read_to_end() == None

        proc.stdout.close()
        proc.wait()
def process(filename_in, filename_out):
    """Write a deduplicated, gzip-compressed copy of a WARC file.

    Args:
        filename_in: Path of the archive to read.
        filename_out: Path of the archive to write.

    Every response record is looked up with ``ia_available``; a truthy
    result counts as a hit and a revisit record from ``revisit_record`` is
    written instead, otherwise it counts as a miss and the response is
    copied.  Records of any other type are copied without being counted.
    The hit/miss totals and the elapsed time are printed at the end.
    """
    starttime = datetime.now()
    dedupemiss = 0
    dedupehit = 0
    with open(filename_in, 'rb') as file_in, \
            open(filename_out, 'wb') as file_out:
        writer = WARCWriter(filebuf=file_out, gzip=True)
        for record in ArchiveIterator(file_in):
            if record.rec_headers.get_header('WARC-Type') != 'response':
                # Not a lookup candidate, so neither a hit nor a miss.
                writer.write_record(record)
                continue

            record_url = record.rec_headers.get_header('WARC-Target-URI')
            record_digest = record.rec_headers.get_header(
                'WARC-Payload-Digest')
            ia_record = ia_available(record_url, record_digest)
            if ia_record:
                print('Found duplicate, writing revisit record.')
                writer.write_record(
                    revisit_record(writer, record, ia_record))
                dedupehit += 1
            else:
                # A miss is a response for which no duplicate was found.
                writer.write_record(record)
                dedupemiss += 1
    print(str(dedupehit) + " Hits")
    print(str(dedupemiss) + " Misses")
    print("took " + str(datetime.now() - starttime) + " to execute")
Exemple #23
0
    def test_post_stream(self):
        """A POST whose body is a stream is captured with its full payload."""

        def nop_filter(request, response, warc_writer):
            # Pass-through filter that only checks both records exist.
            assert request
            assert response
            return request, response

        warc_writer = BufferWARCWriter(gzip=False)
        url = 'http://localhost:{0}/post'.format(self.port)
        postbuff = BytesIO(b'somedatatopost')

        with capture_http(warc_writer, nop_filter):
            res = requests.post(url, data=postbuff)

        records = ArchiveIterator(warc_writer.get_stream())

        # The response record is written first.
        response = next(records)
        response_headers = response.rec_headers
        assert response.rec_type == 'response'
        assert response_headers['WARC-Target-URI'] == url
        assert response_headers['WARC-IP-Address'] == '127.0.0.1'

        captured_body = response.content_stream().read().decode('utf-8')
        assert res.json() == json.loads(captured_body)

        # The request record follows and carries the posted data.
        request = next(records)
        request_headers = request.rec_headers
        assert request.rec_type == 'request'
        assert request_headers['WARC-Target-URI'] == url
        assert request_headers['WARC-IP-Address'] == '127.0.0.1'

        posted = request.content_stream().read().decode('utf-8')
        assert posted == 'somedatatopost'
    def test_unseekable_gz(self):
        """ Test iterator on unseekable 3 record gzipped WARC input.

        The archive is piped through ``cat`` so the iterator only ever sees
        a pipe; record types and offsets are checked while mixing ``for``
        loops with explicit ``next`` calls on the same iterator.
        """
        proc = subprocess.Popen(
            ['cat', get_test_file('example-resource.warc.gz')],
            stdout=subprocess.PIPE)
        with closing(ArchiveIterator(proc.stdout)) as a:
            # First record, taken through a for loop that is left at once.
            for record in a:
                assert record.rec_type == 'warcinfo'
                assert a.get_record_offset() == 0
                break

            # Second record, taken with next() on the same iterator.
            record = next(a)
            assert record.rec_type == 'warcinfo'
            assert a.get_record_offset() == 361

            # Third record, again through a for loop.
            for record in a:
                assert record.rec_type == 'resource'
                assert a.get_record_offset() == 802
                break

            # Nothing is left after the third record.
            with pytest.raises(StopIteration):
                record = next(a)

        # State expected once the iterator has been closed.
        assert a.record == None
        assert a.reader == None
        assert a.read_to_end() == None

        proc.stdout.close()
        proc.wait()
Exemple #25
0
 def process(self, item):
     """Write a copy of the item's WARC in which repeated payloads become revisits.

     The first response seen for a payload digest is kept and remembered
     (record id, date, target URI); later responses with the same digest are
     replaced by ``self._record_response_to_revisit``.  Target URIs wrapped
     in angle brackets are unwrapped, and the warcinfo record's
     ``WARC-Filename`` is pointed at the output file.
     """
     digests = {}
     input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
     output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
     with open(input_filename, 'rb') as f_in, \
             open(output_filename, 'wb') as f_out:
         writer = WARCWriter(filebuf=f_out, gzip=True)
         for record in ArchiveIterator(f_in):
             url = record.rec_headers.get_header('WARC-Target-URI')
             if url is not None and url.startswith('<'):
                 # Only rewrite when the URI really is of the form <...>;
                 # a stray leading '<' is left alone instead of raising.
                 bracketed = re.search('^<(.+)>$', url)
                 if bracketed is not None:
                     url = bracketed.group(1)
                     record.rec_headers.replace_header('WARC-Target-URI', url)

             record_type = record.rec_headers.get_header('WARC-Type')
             if record_type == 'response':
                 digest = record.rec_headers.get_header('WARC-Payload-Digest')
                 # Responses without a digest cannot be compared, so they
                 # are never treated as duplicates of one another.
                 if digest is not None and digest in digests:
                     writer.write_record(
                         self._record_response_to_revisit(writer, record,
                                                          digests[digest])
                     )
                 else:
                     if digest is not None:
                         digests[digest] = (
                             record.rec_headers.get_header('WARC-Record-ID'),
                             record.rec_headers.get_header('WARC-Date'),
                             record.rec_headers.get_header('WARC-Target-URI')
                         )
                     writer.write_record(record)
             elif record_type == 'warcinfo':
                 record.rec_headers.replace_header('WARC-Filename', output_filename)
                 writer.write_record(record)
             else:
                 writer.write_record(record)
 def _find_first_by_type(self, filename, match_type, **params):
     """Yield the first record of ``filename`` whose ``rec_type`` matches.

     The test file is opened in binary mode and wrapped in an
     ``ArchiveIterator`` built with ``**params``. At most one record is
     yielded; nothing is yielded when no record has the requested type.

     :param filename: name of the test file, resolved via ``get_test_file``
     :param match_type: record type to look for
     :param params: extra keyword arguments passed to ``ArchiveIterator``
     """
     with open(get_test_file(filename), 'rb') as source, \
             closing(ArchiveIterator(source, **params)) as records:
         for candidate in records:
             if candidate.rec_type != match_type:
                 continue
             yield candidate
             return
Exemple #27
0
 def read_warc_archive(self, archive_path):
     """Collect links to ``self.domain`` from the responses of a WARC file.

     Every ``response`` record is parsed as HTML. For each ``<a>`` element
     whose ``href`` contains ``self.domain`` and starts with ``"http"``, a
     dict is appended to ``self.data`` holding the link rebuilt as
     ``self.proper_domain`` + the href's path, the record's target URI and
     the record's parsed ``WARC-Date``.

     Records whose payload cannot be read or parsed are skipped.

     :param archive_path: path of the WARC file to read
     """
     with open(archive_path, 'rb') as stream:
         for record in ArchiveIterator(stream):
             if record.rec_type != 'response':
                 continue

             try:
                 parser = BeautifulSoup(record.content_stream().read(),
                                        features="html.parser")
             except Exception:
                 # Best effort: skip unreadable payloads, but do not
                 # swallow KeyboardInterrupt / SystemExit as a bare
                 # ``except:`` would.
                 continue

             for link in parser.find_all("a"):
                 href = link.attrs.get("href")
                 if href is None:
                     continue
                 if self.domain not in href or not href.startswith("http"):
                     continue

                 path = urlparse(href).path
                 domain_link = self.proper_domain + path
                 self.data.append({
                     '{0}_link'.format(self.domain_name):
                     domain_link,
                     'reference_link':
                     record.rec_headers.get_header(
                         'WARC-TARGET-URI'),
                     'warc_date':
                     dateutil.parser.parse(
                         record.rec_headers.get_header(
                             'WARC-Date'))
                 })
    def test_iterator(self):
        """ Test iterator semantics on 3 record WARC

        The same ``ArchiveIterator`` is advanced by a ``for`` loop that is
        left with ``break``, then by an explicit ``next()`` call, then by a
        second ``for`` loop. Each step must yield the following record
        (warcinfo, response, request) at the expected offset, and a final
        ``next()`` must raise ``StopIteration``.
        """
        with open(get_test_file('example-iana.org-chunked.warc'), 'rb') as fh:
            with closing(ArchiveIterator(fh)) as a:
                # First record, obtained through a for loop that is abandoned
                # immediately; the iterator must stay usable afterwards.
                for record in a:
                    assert record.rec_type == 'warcinfo'
                    assert a.get_record_offset() == 0
                    # The digest checker reports neither a result nor problems.
                    assert record.digest_checker.passed is None
                    assert len(record.digest_checker.problems) == 0
                    break

                # Second record, obtained through next() on the same iterator.
                record = next(a)
                assert record.rec_type == 'response'
                assert a.get_record_offset() == 405
                assert record.digest_checker.passed is None
                assert len(record.digest_checker.problems) == 0

                # Third record, obtained by re-entering a for loop; iteration
                # continues where next() left off instead of restarting.
                for record in a:
                    assert record.rec_type == 'request'
                    assert a.get_record_offset() == 8379
                    assert record.digest_checker.passed is None
                    assert len(record.digest_checker.problems) == 0
                    break

                # No fourth record: the iterator is exhausted.
                with pytest.raises(StopIteration):
                    record = next(a)

        # After closing() has exited, the iterator holds no record and no
        # reader, and read_to_end() returns nothing.
        assert a.record == None
        assert a.reader == None
        assert a.read_to_end() == None
    def __process_warc_gz_file(self, path_name):
        """
        Loads all records of one WARC file into memory and passes them to the registered callback.

        While the file is open, the raw stream of every record is read completely and stored on the
        record as ``read_stream``. After the file is closed, the ``raw_stream`` attribute is deleted
        from every record and the full list of records is handed to
        ``__callback_on_article_extracted`` in a single call.

        :param path_name: path of the WARC file to read
        :return: None
        """
        warc_records = []
        with open(path_name, 'rb') as stream:
            for record in ArchiveIterator(stream):
                # The payload must be read now; the raw stream is only valid
                # while the iterator is positioned on this record.
                record.read_stream = record.raw_stream.read()
                warc_records.append(record)
            self.__logger.info('Extracting {} records'.format(
                len(warc_records)))

        # Drop the stream handles so only the buffered payloads are passed on.
        for record in warc_records:
            del record.raw_stream

        self.__callback_on_article_extracted(warc_records)
Exemple #30
0
def proxy():
    """Stream the payload of the first response record in an archive.org byte range.

    Reads two request arguments:

    * ``iapath`` -- ``"<item>/<filename>"``; must contain exactly one ``/``,
      otherwise the tuple unpacking raises ``ValueError``.
    * ``range`` -- byte range, sent upstream as ``Range: bytes=<range>``.

    The item's metadata is fetched from ``https://archive.org/metadata/<item>``
    and its ``d1`` and ``dir`` fields are used to build the download URL of
    the file. The returned bytes are iterated as archive records (with
    ``arc2warc=True``); the content stream of the first ``response`` record
    is sent back in 8192-byte chunks, using the record's HTTP
    ``Content-Type`` header as mimetype.

    :returns: a streaming ``Response``; ``abort(404)`` is invoked when the
        range contains no response record.
    """
    iapath = request.args['iapath']
    rng = request.args['range']

    item, fn = iapath.split('/')

    meta = requests.get(f"https://archive.org/metadata/{item}").json()

    r = requests.get(f"https://{meta['d1']}{meta['dir']}/{fn}",
                     headers={"Range": f"bytes={rng}"},
                     stream=True)
    try:
        for record in ArchiveIterator(r.raw, arc2warc=True):
            if record.rec_type == 'response':

                def stream():
                    # ``finally`` also runs when the generator is closed
                    # early (e.g. the client disconnects), so the upstream
                    # connection is released in every case.
                    try:
                        content = record.content_stream()
                        buf = content.read(8192)
                        while buf:
                            yield buf
                            buf = content.read(8192)
                    finally:
                        r.close()

                return Response(
                    stream(),
                    mimetype=record.http_headers.get_header('Content-Type'))
    except Exception:
        # Reading the archive failed: release the connection, then re-raise.
        r.close()
        raise

    # No response record in the requested range: nothing will consume the
    # upstream body, so close it before reporting 404.
    r.close()
    return abort(404)
Exemple #31
0
    def parse_uploaded(self, stream, expected_size):
        """Parse WARC archive.

        Iterates over all records of ``stream`` and builds one index-info
        dict per recording. A new dict is started by every ``warcinfo``
        record whose parsed content has a ``json-metadata`` entry; when the
        first record does not start one, a default dict with offset 0 is
        used. The dicts are collected through ``self.add_index_info``.

        :param stream: file object
        :param int expected_size: expected WARC archive size

        :returns: list of recordings (indices)
        :rtype: list
        """
        arciterator = ArchiveIterator(stream,
                                      no_record_parse=True,
                                      verify_http=True,
                                      block_size=BLOCK_SIZE)
        infos = []

        # index info started by the previous record, still waiting for its offset
        last_indexinfo = None
        # index info of the recording currently being read
        indexinfo = None
        is_first = True
        # the 'ra' set of the current index info; None until one is started
        # from a warcinfo record with json-metadata
        remote_archives = None

        for record in arciterator:
            warcinfo = None
            if record.rec_type == 'warcinfo':
                try:
                    warcinfo = self.parse_warcinfo(record)
                except Exception as e:
                    # Best effort: report the failure and treat the record
                    # as if it carried no warcinfo (warcinfo stays None).
                    print('Error Parsing WARCINFO')
                    traceback.print_exc()

            elif remote_archives is not None:
                # Non-warcinfo record: if it names a source URI that the
                # wam_loader can map to an archive, remember res[2] for the
                # current recording.
                source_uri = record.rec_headers.get('WARC-Source-URI')
                if source_uri:
                    if self.wam_loader:
                        res = self.wam_loader.find_archive_for_url(source_uri)
                        if res:
                            remote_archives.add(res[2])

            # NOTE(review): presumably consumes the rest of the record so that
            # member_info describes it -- confirm against ArchiveIterator.
            arciterator.read_to_end(record)

            # The index info started by the previous record receives its
            # offset only now, from the member info available after reading
            # this record.
            if last_indexinfo:
                last_indexinfo['offset'] = arciterator.member_info[0]
                last_indexinfo = None

            if warcinfo and 'json-metadata' in warcinfo:
                # A new recording starts: hand the previous index info
                # (None on the very first record) to add_index_info before
                # replacing it.
                self.add_index_info(infos, indexinfo, arciterator.member_info[0])

                indexinfo = warcinfo.get('json-metadata')
                # filled in on the next iteration through last_indexinfo
                indexinfo['offset'] = None

                if 'title' not in indexinfo:
                    indexinfo['title'] = 'Uploaded Recording'

                if 'type' not in indexinfo:
                    indexinfo['type'] = 'recording'

                indexinfo['ra'] = set()
                remote_archives = indexinfo['ra']

                last_indexinfo = indexinfo

            elif is_first:
                # First record does not start a recording itself: fall back
                # to a default index info beginning at offset 0.
                indexinfo = {'type': 'recording',
                             'title': 'Uploaded Recording',
                             'offset': 0,
                            }

            # Only the first record's warcinfo contributes software/date
            # details to the index info.
            if is_first and warcinfo and 'software' in warcinfo:
                indexinfo['warcinfo:software'] = warcinfo['software']
                indexinfo['warcinfo:datetime'] = record.rec_headers.get('WARC-Date')

            is_first = False

        # Hand over the last index info, using the current stream position.
        if indexinfo:
            self.add_index_info(infos, indexinfo, stream.tell())

        # if anything left over, likely due to WARC error, consume remainder
        if stream.tell() < expected_size:
            while True:
                buff = stream.read(8192)
                if not buff:
                    break

        return infos