Esempio n. 1
0
    def test_iterator(self):
        """Test iterator semantics on a 3-record WARC.

        A single ``ArchiveIterator`` is advanced with a mix of ``for`` loops
        (each broken after one record) and direct ``next()`` calls. The
        records are expected in the order warcinfo, response, request, after
        which ``next()`` must raise ``StopIteration``.
        """
        with open(get_test_file('example-iana.org-chunked.warc'), 'rb') as fh:
            a = ArchiveIterator(fh)
            for record in a:
                assert record.rec_type == 'warcinfo'
                break

            record = next(a)
            assert record.rec_type == 'response'

            for record in a:
                assert record.rec_type == 'request'
                break

            with pytest.raises(StopIteration):
                record = next(a)

        # Identity checks: None is a singleton, and ``==`` could be satisfied
        # by any object that defines a permissive __eq__.
        assert a.record is None
        assert a.read_to_end() is None
Esempio n. 2
0
    def parse_uploaded(self, stream, expected_size):
        """Walk an uploaded WARC stream and build its list of recordings.

        A warcinfo record whose parsed content has a 'json-metadata' entry
        starts a new recording; if the very first record does not, a default
        'Uploaded Recording' entry at offset 0 is used instead.

        :param stream: file object
        :param int expected_size: expected WARC archive size

        :returns: list of recordings (indices)
        :rtype: list
        """
        warc_iter = ArchiveIterator(stream,
                                    no_record_parse=True,
                                    verify_http=True,
                                    block_size=BLOCK_SIZE)
        infos = []

        awaiting_offset = None  # recording whose 'offset' is still None
        current = None          # recording currently being accumulated
        first_record = True
        archive_ids = None      # the 'ra' set of the current recording

        for record in warc_iter:
            warcinfo = None

            if record.rec_type == 'warcinfo':
                try:
                    warcinfo = self.parse_warcinfo(record)
                except Exception:
                    # best effort: a broken warcinfo is reported and the
                    # record is then handled like one without metadata
                    print('Error Parsing WARCINFO')
                    traceback.print_exc()

            elif archive_ids is not None:
                source_uri = record.rec_headers.get('WARC-Source-URI')
                if source_uri and self.wam_loader:
                    match = self.wam_loader.find_archive_for_url(source_uri)
                    if match:
                        archive_ids.add(match[2])

            warc_iter.read_to_end(record)

            # A recording registered on the previous iteration gets its
            # offset from the member info of the record just consumed.
            if awaiting_offset:
                awaiting_offset['offset'] = warc_iter.member_info[0]
                awaiting_offset = None

            if warcinfo and 'json-metadata' in warcinfo:
                # Hand over whatever was collected so far, then start anew.
                self.add_index_info(infos, current, warc_iter.member_info[0])

                current = warcinfo.get('json-metadata')
                current['offset'] = None
                current.setdefault('title', 'Uploaded Recording')
                current.setdefault('type', 'recording')

                archive_ids = current['ra'] = set()
                awaiting_offset = current

            elif first_record:
                current = {
                    'type': 'recording',
                    'title': 'Uploaded Recording',
                    'offset': 0,
                }

            if first_record and warcinfo and 'software' in warcinfo:
                current['warcinfo:software'] = warcinfo['software']
                current['warcinfo:datetime'] = record.rec_headers.get(
                    'WARC-Date')

            first_record = False

        if current:
            self.add_index_info(infos, current, stream.tell())

        # if anything left over, likely due to WARC error, consume remainder
        if stream.tell() < expected_size:
            while stream.read(8192):
                pass

        return infos
Esempio n. 3
0
    def parse_uploaded(self, stream, expected_size):
        """Parse an uploaded WARC archive into a list of recordings.

        A warcinfo record whose parsed content has a 'json-metadata' entry
        starts a new recording. A warcinfo record without that entry is
        treated like any other record (previously it raised ``TypeError``
        when the missing metadata was indexed). If the first record does not
        start a recording, a default 'Uploaded Recording' entry at offset 0
        is used.

        :param stream: file object
        :param int expected_size: expected WARC archive size

        :returns: list of recordings (indices)
        :rtype: list
        """
        arciterator = ArchiveIterator(stream,
                                      no_record_parse=True,
                                      verify_http=True,
                                      block_size=BLOCK_SIZE)
        infos = []

        # recording whose 'offset' is filled in once the next record is read
        last_indexinfo = None
        indexinfo = None
        is_first = True

        for record in arciterator:
            warcinfo = None
            if record.rec_type == 'warcinfo':
                try:
                    warcinfo = self.parse_warcinfo(record)
                except Exception:
                    # best effort: report and carry on as if no metadata
                    print('Error Parsing WARCINFO')
                    traceback.print_exc()

            arciterator.read_to_end(record)

            if last_indexinfo:
                last_indexinfo['offset'] = arciterator.member_info[0]
                last_indexinfo = None

            # Only a warcinfo that actually carries 'json-metadata' can start
            # a recording; otherwise .get() would return None and the item
            # assignment below would fail.
            if warcinfo and 'json-metadata' in warcinfo:
                self.add_index_info(infos, indexinfo,
                                    arciterator.member_info[0])

                indexinfo = warcinfo.get('json-metadata')
                indexinfo['offset'] = None

                if 'title' not in indexinfo:
                    indexinfo['title'] = 'Uploaded Recording'

                if 'type' not in indexinfo:
                    indexinfo['type'] = 'recording'

                last_indexinfo = indexinfo

            elif is_first:
                indexinfo = {
                    'type': 'recording',
                    'title': 'Uploaded Recording',
                    'offset': 0,
                }

            is_first = False

        if indexinfo:
            self.add_index_info(infos, indexinfo, stream.tell())

        # if anything left over, likely due to WARC error, consume remainder
        if stream.tell() < expected_size:
            while stream.read(8192):
                pass

        return infos
Esempio n. 4
0
    def parse_uploaded(self, stream, expected_size):
        """Read a WARC archive record by record and index its recordings.

        :param stream: file object
        :param int expected_size: expected WARC archive size

        :returns: list of recordings (indices)
        :rtype: list
        """
        reader = ArchiveIterator(
            stream,
            no_record_parse=True,
            verify_http=True,
            block_size=BLOCK_SIZE,
        )
        infos = []

        unresolved = None       # recording still waiting for its offset
        recording = None        # recording being built up
        remote_archives = None  # 'ra' set belonging to `recording`

        for position, record in enumerate(reader):
            is_first = position == 0

            warcinfo = None
            if record.rec_type == 'warcinfo':
                try:
                    warcinfo = self.parse_warcinfo(record)
                except Exception:
                    print('Error Parsing WARCINFO')
                    traceback.print_exc()

            elif remote_archives is not None:
                source_uri = record.rec_headers.get('WARC-Source-URI')
                res = None
                if source_uri and self.wam_loader:
                    res = self.wam_loader.find_archive_for_url(source_uri)
                if res:
                    remote_archives.add(res[2])

            reader.read_to_end(record)

            if unresolved:
                unresolved['offset'] = reader.member_info[0]
                unresolved = None

            # a warcinfo carrying 'json-metadata' opens a new recording
            starts_recording = bool(warcinfo) and 'json-metadata' in warcinfo

            if starts_recording:
                self.add_index_info(infos, recording, reader.member_info[0])

                recording = warcinfo.get('json-metadata')
                recording['offset'] = None

                for key, fallback in (('title', 'Uploaded Recording'),
                                      ('type', 'recording')):
                    if key not in recording:
                        recording[key] = fallback

                remote_archives = set()
                recording['ra'] = remote_archives

                unresolved = recording

            elif is_first:
                recording = {
                    'type': 'recording',
                    'title': 'Uploaded Recording',
                    'offset': 0,
                }

            if is_first and warcinfo and 'software' in warcinfo:
                recording['warcinfo:software'] = warcinfo['software']
                warc_date = record.rec_headers.get('WARC-Date')
                recording['warcinfo:datetime'] = warc_date

        if recording:
            self.add_index_info(infos, recording, stream.tell())

        # if anything left over, likely due to WARC error, consume remainder
        if stream.tell() < expected_size:
            leftover = stream.read(8192)
            while leftover:
                leftover = stream.read(8192)

        return infos