Beispiel #1
0
 def process(self, item):
     """Write a deduplicated copy of the item's gzipped WARC file.

     Reads ``<item_dir>/<warc_file_base>.warc.gz`` and writes
     ``<item_dir>/<warc_file_base>-deduplicated.warc.gz``. The first
     ``response`` record seen for a given ``WARC-Payload-Digest`` is copied
     as-is; later ones with the same digest are replaced by whatever
     ``self._record_response_to_revisit`` returns for them. ``warcinfo``
     records get their ``WARC-Filename`` pointed at the output file. All
     other records are copied unchanged.

     :param item: mapping providing ``item_dir`` and ``warc_file_base``.
     """
     # digest -> (record id, date, target URI) of the first response seen.
     digests = {}
     input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
     output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
     with open(input_filename, 'rb') as f_in, \
             open(output_filename, 'wb') as f_out:
         writer = WARCWriter(filebuf=f_out, gzip=True)
         for record in ArchiveIterator(f_in):
             headers = record.rec_headers
             url = headers.get_header('WARC-Target-URI')
             if url is not None and url.startswith('<'):
                 # Unwrap "<uri>" only when the closing bracket is present;
                 # a malformed value is passed through instead of crashing.
                 match = re.search('^<(.+)>$', url)
                 if match is not None:
                     headers.replace_header('WARC-Target-URI', match.group(1))

             record_type = headers.get_header('WARC-Type')
             if record_type == 'response':
                 digest = headers.get_header('WARC-Payload-Digest')
                 # Records without a digest can never be proven identical,
                 # so they must not share the ``None`` key.
                 if digest is not None:
                     if digest in digests:
                         record = self._record_response_to_revisit(
                             writer, record, digests[digest])
                     else:
                         digests[digest] = (
                             headers.get_header('WARC-Record-ID'),
                             headers.get_header('WARC-Date'),
                             headers.get_header('WARC-Target-URI')
                         )
             elif record_type == 'warcinfo':
                 headers.replace_header('WARC-Filename', output_filename)
             writer.write_record(record)
Beispiel #2
0
    def build_writer(self):
        """
        Create the WARC output stream, write its "warcinfo" record and
        return the writer.

        The file name comes from ``self.build_filename()`` and is placed in
        the ``WARC_FILE_DIRECTORY`` setting (default ``'.'``). When
        ``self.debug`` is truthy the records go to ``sys.stdout.buffer``
        instead of a file.
        """
        target_dir = self.settings.get('WARC_FILE_DIRECTORY', '.')
        warc_name = self.build_filename()

        stream = (
            sys.stdout.buffer
            if self.debug
            else open(os.path.join(target_dir, warc_name), 'wb')
        )

        logger.debug(f"Generating WARC file {warc_name}")
        use_gzip = self.settings.getbool('WARC_GZIP', True)
        version = self.settings['WARC_VERSION']
        warc_writer = WARCWriter(stream, gzip=use_gzip, warc_version=version)

        # Fields that are always present in the warcinfo payload.
        info = {
            'hostname': self.hostname,
            'ip': self.ip_address,
            'http-header-user-agent': self.settings["USER_AGENT"],
            'robots': 'classic' if self.settings["ROBOTSTXT_OBEY"] else 'none',
        }
        # Fields copied straight from optional settings (None when unset).
        optional_fields = (
            ('operator', "WARC_OPERATOR"),
            ('software', "WARC_SOFTWARE"),
            ('isPartOf', "WARC_IS_PART_OF"),
            ('description', "WARC_DESCRIPTION"),
            ('format', "WARC_FORMAT"),
            ('conformsTo', "WARC_CONFORMS_TO"),
        )
        for field, setting_name in optional_fields:
            info[field] = self.settings.get(setting_name)

        warc_writer.write_record(
            warc_writer.create_warcinfo_record(warc_name, info))
        return warc_writer
Beispiel #3
0
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write them as response records to a gzipped WARC file.

    Each URL is printed, requested with ``Accept-Encoding: identity`` and
    streamed into one ``response`` record. The record's HTTP status line is
    the one actually returned by the server.

    :urls: list of urls to binary files
    :warcfile_path: path to a WARC file.

    """

    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        for url in urls:
            print(url)
            resp = requests.get(url,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True)
            try:
                # Record the real status rather than always claiming
                # "200 OK", so error pages are archived as errors.
                status_line = '{} {}'.format(resp.status_code,
                                             resp.reason or '').strip()
                headers_list = resp.raw.headers.items()
                http_headers = StatusAndHeaders(status_line,
                                                headers_list,
                                                protocol='HTTP/1.0')
                record = writer.create_warc_record(url,
                                                   'response',
                                                   payload=resp.raw,
                                                   http_headers=http_headers)
                writer.write_record(record)
            finally:
                # Streamed responses keep the connection open until closed.
                resp.close()
Beispiel #4
0
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    """Mirror ``url`` with wget and recompress the resulting WARC.

    Builds a ``wget --mirror`` command line, optionally wrapped in
    ``timeout``, and runs it through ``system_check``. wget is asked for an
    uncompressed WARC when it supports ``--no-warc-compression``. That
    ``<base>.warc`` file is then rewritten record by record into
    ``<base>.warc.gz`` and the uncompressed file is removed.

    :param url: URL to mirror.
    :param out_path: download directory passed to ``wget -P``.
    :param time_limit: value for ``timeout``; falsy disables the limit.
    :param agent: user agent string, or None.
    :param filetypes: accept list for ``wget -A``, or None.
    :param warcfilename: WARC file name, normally ending in ``.warc.gz``;
        None disables WARC output and the recompression step.
    :param wait: value for ``wget --wait``, or None.
    """
    # Local import so the block does not depend on the file's import list.
    import shlex

    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + shlex.quote(str(wait))
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent " + shlex.quote(agent)

    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A " + shlex.quote(filetypes)

    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # Strip from ".warc.gz" onwards; when the suffix is absent keep the
        # whole name (a bare slice with find() == -1 would drop a character).
        suffix_pos = warcfilename.find(".warc.gz")
        if suffix_pos == -1:
            warcfilebasename = warcfilename
        else:
            warcfilebasename = warcfilename[:suffix_pos]
        warcoption = "--warc-file " + shlex.quote(warcfilebasename)

    if warcoption and check_wget_compression(
            "wget --help | grep 'no-warc-compression'"):
        warcoption += " --no-warc-compression"

    # Every interpolated value is shell-quoted so that URLs containing
    # characters such as "&" or ";" cannot alter the command.
    cmd += "wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption,
        FILETYPES=filetypesoption,
        URL=shlex.quote(str(url)),
        DOWNLOAD_PATH=shlex.quote(str(out_path)),
        AGENT=agentoption,
        WARC=warcoption)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")

    if warcfilebasename is None:
        # No WARC was requested, so there is nothing to recompress.
        return

    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        if record.http_headers.get_header(
                                'Transfer-Encoding') == "chunked":
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if header is non ascii, create a new header, with status code only
                            # content length and content type will be filled before writing
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                    # Clear the length so it is filled in again on write.
                    record.length = None
                    writer.write_record(record)
            except Exception as exc:
                # Best effort: keep what was written so far, but report it.
                sys.stderr.write(
                    "Warning: WARC recompression stopped early: {}\n".format(exc))

    system_check("rm {WARC}".format(
        WARC=shlex.quote(warcfilebasename + ".warc")))
Beispiel #5
0
 def download_one(self, path: Path, source: str) -> None:
     """Fetch every record for ``source`` and store them in one WARC.

     Rows are looked up with ``self.fetch_source_rows``; the records
     yielded by ``fetch_all_cc`` for those rows are written, gzipped, to
     ``path`` through an ``AtomicFileWriter``.
     """
     logging.info(f"Fetching {source} for {self.name}")
     rows = self.fetch_source_rows(source)
     with AtomicFileWriter(path) as sink:
         warc_writer = WARCWriter(sink, gzip=True)
         logging.info(f"Downloading {source}")
         fetched_records = fetch_all_cc(rows, self.nthread,
                                        self.disable_progress)
         for fetched in fetched_records:
             warc_writer.write_record(fetched)
    def load_and_write(self, stream, output):
        """Copy every record read from ``stream`` into a gzipped WARC file.

        :param stream: binary stream handed to ``ArchiveIterator``.
        :param output: path of the file to create (opened in ``'wb'`` mode).
        """
        reader_options = dict(no_record_parse=True,
                              arc2warc=True,
                              verify_http=False)
        with open(output, 'wb') as sink:
            warc_writer = WARCWriter(filebuf=sink, gzip=True)
            for entry in ArchiveIterator(stream, **reader_options):
                warc_writer.write_record(entry)
    def write_memento(self, murl=None):
        """
        Fetch a memento and store it as a gzipped WARC file.

        Nothing is downloaded when ``self.lookup_memento(murl)`` is truthy.
        Otherwise the file is placed below the memento directory in nested
        folders built from the "handle" (lower-cased), "domain", "archive"
        and "wrep" + "lang" fields returned by ``Utils.get_murl_info``, and
        is named after the "timestamp" field plus the WARC extension.

        Failures are reported on stderr and never raised.

        Parameters:
            murl (str): URI-M

        Returns:
            (bool): True on Success and False on Failure
        """
        try:
            if self.lookup_memento(murl):
                return True

            response = Utils.get_murl_info(murl, self.__thandle)
            mpath = os.path.join(
                self.__memento_dir,
                response["handle"].lower(),
                response["domain"],
                response["archive"],
                response["wrep"] + response["lang"],
            )
            # One race-free call instead of an exists()/mkdir() pair per level.
            os.makedirs(mpath, exist_ok=True)
            try:
                mpath = os.path.join(mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
                # Request first, so a failed connection does not leave an
                # empty WARC file behind.
                resp = requests.get(murl,
                                    headers={'Accept-Encoding': 'identity'},
                                    stream=True, timeout=120)
                try:
                    with open(mpath, "wb") as output:
                        writer = WARCWriter(output, gzip=True)

                        # get raw headers from urllib3
                        headers_list = resp.raw.headers.items()
                        # Store the status the server actually returned.
                        status_line = '{} {}'.format(resp.status_code, resp.reason or '').strip()
                        http_headers = StatusAndHeaders(status_line, headers_list, protocol='HTTP/1.1')
                        # NOTE(review): the record URI is the local file path,
                        # not murl — kept as in the original; confirm intent.
                        record = writer.create_warc_record(mpath, 'response',
                                                           payload=resp.raw,
                                                           http_headers=http_headers)
                        writer.write_record(record)
                finally:
                    resp.close()
                return True
            except requests.exceptions.TooManyRedirects:
                sys.stderr.write(str(murl) + "Too Many redirects" + "\n")
            except requests.exceptions.ConnectTimeout:
                sys.stderr.write(str(murl) + "Connection Timeout" + "\n")
            except Exception as e:
                sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + str(murl) + "\n")
        except Exception as e:
            # str() keeps the handler itself from raising when murl is None.
            sys.stderr.write("Memento Write Error: " + str(murl) + " " + str(e) + "\n")
        return False
Beispiel #8
0
    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        """Open the WARC output file, set up the HTTP session and write the warcinfo record.

        :param filename: path of the WARC file to create.
        :param logger_: object whose ``log(level, message)`` method is used for reporting.
        :param program_name: value of the ``software`` field of the generated warcinfo record.
        :param user_agent: value sent as the ``User-agent`` request header.
        :param overwrite_warc: when false, a numbered variant of ``filename`` that does not
            exist yet (``name-00000.warc.gz``, ``name-00001.warc.gz``, ...) is used instead.
        :param err_threshold: stored as the error threshold.
        :param warcinfo_record_data: optional pair; item 0 is passed as the WARC headers and
            item 1 (a mapping) is serialised as the payload of the warcinfo record.
        :param known_bad_urls: optional path to a UTF-8 file with one URL per line.
        :param max_no_of_calls_in_period: number of calls allowed per rate-limit period.
        :param limit_period: length of the rate-limit period.
        :param proxy_url: optional proxy used for both http and https.
        :param allow_cookies: stored for the cookie handling of the download method.
        """
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find out next nonexisting warc filename
            # Split the ORIGINAL name once; splitting inside the loop would stack the counters
            # (name-00000-00001.warc.gz) instead of incrementing them.
            basename, ext = os.path.splitext(filename)  # Should be filename.warc.gz
            if ext == '.gz' and basename.endswith('.warc'):
                basename, ext2 = os.path.splitext(basename)  # Should be filename.warc
                ext = ext2 + ext  # Should be .warc.gz

            num = 0
            while os.path.exists(filename):
                filename = '{0}-{1:05d}{2}'.format(basename, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads

        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which cause aborting to prevent denial

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                             encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)
Beispiel #9
0
 def process(self, item):
     """Write a deduplicated, gzipped copy of the item's WARC file.

     Reads ``<item_dir>/<warc_file_base>.warc`` and writes
     ``<item_dir>/<warc_file_base>-deduplicated.warc.gz``. The first
     ``response`` record seen for a given ``WARC-Payload-Digest`` is copied
     as-is; later ones with the same digest are replaced by whatever
     ``self._record_response_to_revisit`` returns for them. ``warcinfo``
     records get their ``WARC-Filename`` pointed at the output file. All
     other records are copied unchanged.

     :param item: mapping providing ``item_dir`` and ``warc_file_base``.
     """
     # digest -> (record id, date, target URI) of the first response seen.
     digests = {}
     input_filename = "%(item_dir)s/%(warc_file_base)s.warc" % item
     output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item
     with open(input_filename, 'rb') as f_in, \
             open(output_filename, 'wb') as f_out:
         writer = WARCWriter(filebuf=f_out, gzip=True)
         for record in ArchiveIterator(f_in):
             headers = record.rec_headers
             url = headers.get_header('WARC-Target-URI')
             if url is not None and url.startswith('<'):
                 # Unwrap "<uri>" only when the closing bracket is present;
                 # a malformed value is passed through instead of crashing.
                 match = re.search('^<(.+)>$', url)
                 if match is not None:
                     headers.replace_header('WARC-Target-URI', match.group(1))

             record_type = headers.get_header('WARC-Type')
             if record_type == 'response':
                 digest = headers.get_header('WARC-Payload-Digest')
                 # Records without a digest can never be proven identical,
                 # so they must not share the ``None`` key.
                 if digest is not None:
                     if digest in digests:
                         record = self._record_response_to_revisit(
                             writer, record, digests[digest])
                     else:
                         digests[digest] = (
                             headers.get_header('WARC-Record-ID'),
                             headers.get_header('WARC-Date'),
                             headers.get_header('WARC-Target-URI')
                         )
             elif record_type == 'warcinfo':
                 headers.replace_header('WARC-Filename', output_filename)
             writer.write_record(record)
Beispiel #10
0
def main(args):
    """Copy a WARC file, removing angle brackets around target URIs.

    Every record of ``args.input`` is written, gzipped, to ``args.output``.
    Leading ``<`` and trailing ``>`` characters are stripped from each
    ``WARC-Target-URI`` header on the way.

    :param args: object with ``input`` and ``output`` path attributes.
    """
    # Both files are managed by the ``with`` statement so the output is
    # closed (and flushed) even when reading or writing fails.
    with open(args.output, 'wb') as output, \
            open(args.input, 'rb') as stream:
        writer = WARCWriter(output, gzip=True)
        for record in ArchiveIterator(stream):
            if 'WARC-Target-URI' in record.rec_headers:
                record.rec_headers['WARC-Target-URI'] = record.rec_headers[
                    'WARC-Target-URI'].lstrip('<').rstrip('>')
            writer.write_record(record)
Beispiel #11
0
def mergeWarc():
    """
    Merge multiple WARC files into a single file, writing revisit records for
    items which occur multiple times

    Input file names are read from stdin, one per line. ``resource`` and
    ``response`` records are compared by their ``WARC-Payload-Digest``: the
    first occurrence is copied, later ones are replaced by a revisit record
    referring to the first. Records without a digest and records of other
    types are always copied.
    """

    parser = argparse.ArgumentParser(
        description='Merge WARCs, reads filenames from stdin.')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('output',
                        type=argparse.FileType('wb'),
                        help='Output WARC')

    args = parser.parse_args()
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)

    unique = 0
    revisit = 0
    # payload digest -> uri/id/date of the first record carrying it
    payloadMap = {}
    writer = WARCWriter(args.output, gzip=True)
    for line in sys.stdin:
        path = line.strip()
        if not path:
            # A blank line is not a file name.
            continue
        with open(path, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    uri = headers.get_header('WARC-Target-URI')
                    # Without a digest there is nothing to compare, so the
                    # record is neither looked up nor remembered.
                    dup = payloadMap.get(csum) if csum is not None else None
                    if dup is None:
                        if csum is not None:
                            payloadMap[csum] = {
                                'uri': uri,
                                'id': rid,
                                'date': headers.get_header('WARC-Date')
                            }
                        unique += 1
                    else:
                        logging.debug('Record {} is duplicate of {}'.format(
                            rid, dup['id']))
                        # The revisit keeps THIS record's URI as its target;
                        # the earlier record is only what it refers to.
                        record = writer.create_revisit_record(
                            uri, csum, dup['uri'], dup['date'])
                        record.rec_headers.add_header('WARC-Truncated',
                                                      'length')
                        record.rec_headers.add_header('WARC-Refers-To',
                                                      dup['id'])
                        revisit += 1
                else:
                    unique += 1
                writer.write_record(record)
    logging.info('Wrote {} unique records, {} revisits'.format(
        unique, revisit))
Beispiel #12
0
 def open(self):
     """Open a new WARC file and write its warcinfo record.

     The name is ``<prefix>[-<subprefix>]-<serial>-<hostname>.warc`` with
     ``.gz`` appended when ``self.gzip`` is set; the serial comes from
     ``self.get_serial``. Sets ``self.filename``, ``self.f`` and
     ``self.writer``.
     """
     stem = self.prefix
     if self.subprefix:
         stem = stem + '-' + self.subprefix
     serial = self.get_serial(stem)
     name = stem + '-' + serial + '-' + self.hostname + '.warc'
     if self.gzip:
         name = name + '.gz'
     self.filename = name
     self.f = open(name, 'wb')
     self.writer = WARCWriter(self.f, gzip=self.gzip)
     self.writer.write_record(
         self.writer.create_warcinfo_record(self.filename, self.info))
def process(filename_in, filename_out):
    """Copy a WARC file, turning already-archived responses into revisits.

    For each ``response`` record, ``ia_available`` is asked about its target
    URI and payload digest. A truthy answer makes the record be written as
    the result of ``revisit_record``; otherwise it is copied unchanged.
    Hit/miss counters and the elapsed time are printed at the end.

    NOTE(review): the "Misses" counter is only incremented for records that
    are not responses — confirm that this is the intended meaning.
    """
    started = datetime.now()
    hits = 0
    misses = 0
    with open(filename_in, 'rb') as source, \
            open(filename_out, 'wb') as sink:
        writer = WARCWriter(filebuf=sink, gzip=True)
        for record in ArchiveIterator(source):
            if record.rec_headers.get_header('WARC-Type') != 'response':
                writer.write_record(record)
                misses += 1
                continue

            target_uri = record.rec_headers.get_header('WARC-Target-URI')
            payload_digest = record.rec_headers.get_header(
                'WARC-Payload-Digest')
            known = ia_available(target_uri, payload_digest)
            if known:
                print('Found duplicate, writing revisit record.')
                writer.write_record(revisit_record(writer, record, known))
                hits += 1
            else:
                writer.write_record(record)
    print(str(hits) + " Hits")
    print(str(misses) + " Misses")
    print("took " + str(datetime.now() - started) + " to execute")
    def run(self):
        """Drain ``self.out_queue`` forever, appending records to the WARC.

        Each queue item is indexed as: ``[0]`` HTTP header list, ``[1]``
        payload stream, ``[2]`` target URI, ``[3]``/``[4]`` the two parts of
        the HTTP status line. The SHA-1 of the first ``BLOCK_SIZE`` bytes of
        the record stream is looked up in ``self.dedup``: a known digest
        produces a payload-less ``revisit`` record, an unknown one is saved
        and the full ``response`` record is written.

        The lock is released and the queue item marked done even when
        processing raises, so one bad item cannot block other users of the
        lock or a ``join()`` on the queue.
        """
        with open(self.warcfile, 'ab') as output:
            # The writer holds no per-record state, so one instance suffices.
            writer = WARCWriter(output, gzip=False)
            while True:
                self.lock.acquire()
                try:
                    data = self.out_queue.get()
                    try:
                        headers_list = data[0]
                        http_headers = StatusAndHeaders(
                            '{} {}'.format(data[3], data[4]),
                            headers_list,
                            protocol='HTTP/1.0')
                        record = writer.create_warc_record(
                            data[2],
                            'response',
                            payload=data[1],
                            http_headers=http_headers)
                        h = hashlib.sha1()
                        h.update(record.raw_stream.read(BLOCK_SIZE))
                        if self.dedup.lookup(h.hexdigest()):
                            record = writer.create_warc_record(
                                data[2], 'revisit', http_headers=http_headers)
                        else:
                            self.dedup.save(h.hexdigest(), data[2])
                            # Rewind what was consumed for hashing.
                            record.raw_stream.seek(0)
                        writer.write_record(record)
                    finally:
                        self.out_queue.task_done()
                finally:
                    self.lock.release()
Beispiel #15
0
    def __init__(self, fd, logger):
        """Wrap ``fd`` in a gzip WARC writer and reset the bookkeeping state.

        :param fd: binary file object the WARC records are written to.
        :param logger: stored as ``self.logger``.
        """
        self.logger = logger
        self.writer = WARCWriter(fd, gzip=True)

        # In-memory log buffer, its text encoding and its size cap in bytes.
        self.logEncoding = 'utf-8'
        self.log = BytesIO()
        self.maxLogSize = 500 * 1024

        # Document URL -> WARC record id; needed for DomSnapshotEvent and
        # ScreenshotEvent.
        self.documentRecords = {}
        # Record id of the warcinfo record; None until it is known.
        self.warcinfoRecordId = None
Beispiel #16
0
    def process(self, item):
        """Write a copy of the item's WARC with known responses as revisits.

        Reads ``<item_dir>/<warc_file_base>.warc.gz`` and writes
        ``<item_dir>/<warc_file_base>-deduplicated.warc.gz``. For every
        ``response`` record ``self.ia_available`` is asked about its target
        URI and payload digest; a truthy answer makes the record be written
        as the result of ``self.revisit_record``. Everything else is copied.

        :param item: mapping providing ``item_dir`` and ``warc_file_base``.
        """
        filename_in = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
        filename_out = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item

        with open(filename_in, 'rb') as source, \
                open(filename_out, 'wb') as sink:
            writer = WARCWriter(filebuf=sink, gzip=True)
            for record in ArchiveIterator(source):
                if record.rec_headers.get_header('WARC-Type') != 'response':
                    writer.write_record(record)
                    continue

                target_uri = record.rec_headers.get_header('WARC-Target-URI')
                payload_digest = record.rec_headers.get_header(
                    'WARC-Payload-Digest')
                known = self.ia_available(target_uri, payload_digest)
                if known:
                    print('Found duplicate, writing revisit record.')
                    writer.write_record(
                        self.revisit_record(writer, record, known))
                else:
                    writer.write_record(record)
Beispiel #17
0
 def open(self):
     """Open a new WARC file and write its warcinfo record.

     The name is ``<prefix>[-<subprefix>]-<serial>-<hostname>.warc`` with
     ``.gz`` appended when ``self.gzip`` is set; the serial comes from
     ``self.get_serial``. Sets ``self.filename``, ``self.f``,
     ``self.writer`` and ``self.warcinfo_id`` (the record id of the
     warcinfo record).
     """
     stem = self.prefix
     if self.subprefix:
         # str(): the subprefix may have been parsed from YAML as an int.
         stem = stem + '-' + str(self.subprefix)
     serial = self.get_serial(stem)
     name = stem + '-' + serial + '-' + self.hostname + '.warc'
     if self.gzip:
         name = name + '.gz'
     self.filename = name
     self.f = open(name, 'wb')
     self.writer = WARCWriter(self.f, gzip=self.gzip)
     info_record = self.writer.create_warcinfo_record(self.filename,
                                                      self.info)
     self.warcinfo_id = info_record.rec_headers.get_header('WARC-Record-ID')
     self.writer.write_record(info_record)
Beispiel #18
0
    def test_init_2(self):
        """Build ``redir2.warc.gz``, add it to collection ``redir2`` and
        check that the collection's ``index.cdxj`` exists afterwards."""
        coll = 'redir2'
        filename = os.path.join(self.root_dir, 'redir2.warc.gz')
        with open(filename, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)

            # http -> https (trailing slash) -> www2 redirect chain.
            self.create_redirect_record(
                'http://www.example.com/path', 'https://www.example.com/path/',
                '20191003115920')
            self.create_redirect_record(
                'https://www.example.com/path/',
                'https://www2.example.com/path',
                '20191003115927',
                status='302')
            response = self.create_response_record(
                'https://www2.example.com/path', '20191024125646', 'Some Text')
            original_date = response.rec_headers['WARC-Date']
            self.create_revisit_record(
                'https://www2.example.com/path', '20191024125648',
                'https://www2.example.com/path',
                original_date)

        wb_manager(['init', coll])

        wb_manager(['add', coll, filename])

        index_path = os.path.join(self.root_dir, self.COLLS_DIR, coll,
                                  'indexes', 'index.cdxj')
        assert os.path.isfile(index_path)
Beispiel #19
0
    def run(self):
        """Write a warcinfo record and one record per input to ``self.name``.

        Returns 1 when ``load_magic`` fails or the output file already
        exists, 0 otherwise.
        """
        if self.use_magic and not self.load_magic():
            return 1

        try:
            output = open(self.name, self.mode)
        except FileExistsError as e:
            self.logger.error(e)
            hints = (
                '* Use -a/--append to append to an existing WARC file',
                '* Use -o/--overwrite to overwrite existing WARC file',
            )
            for hint in hints:
                self.logger.error(hint)
            return 1

        with closing(output):
            writer = WARCWriter(output, gzip=self.gzip)
            self.make_warcinfo(writer)
            for file_info in self.iter_inputs():
                self.make_record(writer, file_info)

        summary = 'Wrote {0} resources to {1}'.format(self.count, self.name)
        self.logger.info(summary)
        return 0
Beispiel #20
0
    def __init__(self, writer, gzip=True):
        """Accept a ready writer, a file name or a writable object.

        :param writer: a ``BaseWARCWriter`` (used as-is), a ``str`` path
            (opened in ``'wb'`` mode and remembered as ``self.filename``),
            or any object with a ``write`` attribute.
        :param gzip: passed to ``WARCWriter`` when one is created here.
        :raises Exception: if ``writer`` is none of the above.
        """
        self.fh = None
        self.writer = None
        self.filename = 'unknown'
        self.is_first = True

        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
            return

        if isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.filename = writer
            self.writer = WARCWriter(self.fh, gzip=gzip)
            return

        if not hasattr(writer, 'write'):
            raise Exception('writer is in an unknown format')

        self.writer = WARCWriter(writer, gzip=gzip)
Beispiel #21
0
    def test_init_1(self):
        """Build ``redir.warc.gz``, add it to collection ``redir`` and check
        that the collection's ``index.cdxj`` exists afterwards."""
        coll = 'redir'
        filename = os.path.join(self.root_dir, 'redir.warc.gz')
        with open(filename, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)

            # http -> https -> www redirect chain, all at the same timestamp.
            self.create_redirect_record('http://example.com/',
                                        'https://example.com/',
                                        '20180626101112')
            self.create_redirect_record('https://example.com/',
                                        'https://www.example.com/',
                                        '20180626101112')
            response = self.create_response_record('https://www.example.com/',
                                                   '20180626101112',
                                                   'Some Text')
            original_date = response.rec_headers['WARC-Date']

            self.create_revisit_record(
                'https://example.com/path', '20190626101112',
                'https://example.com/abc', original_date)
            self.create_revisit_record(
                'https://www.example.com/', '20190626101112',
                'https://www.example.com/', original_date)

        wb_manager(['init', coll])

        wb_manager(['add', coll, filename])

        index_path = os.path.join(self.root_dir, self.COLLS_DIR, coll,
                                  'indexes', 'index.cdxj')
        assert os.path.isfile(index_path)
Beispiel #22
0
    def run(self):
        """Write a warcinfo record plus the records for every input.

        The optional helpers (magic, tika, mapfile, logfile) are initialised
        first; any failing initialiser makes the method return 1, as does an
        already existing output file. After each successfully made record,
        the index revisit, conversion and transclusion steps run when they
        apply. Returns 0 on completion.
        """
        if self.use_magic == 'magic' and not self.load_magic():
            return 1
        if self.use_tika and not self.load_tika():
            return 1
        if self.use_mapfile and not self.load_mapfile():
            return 1
        if self.use_logfile and not self.init_logfile():
            return 1

        try:
            output = warcio.utils.open(self.name, self.mode)
        except OSError as e:
            # Only the "file exists" case is handled here.
            if e.errno != errno.EEXIST:
                raise

            self.logger.error(e)
            hints = (
                '* Use -a/--append to append to an existing WARC file',
                '* Use -o/--overwrite to overwrite existing WARC file',
            )
            for hint in hints:
                self.logger.error(hint)
            return 1

        with closing(output):
            writer = WARCWriter(output, gzip=self.gzip)

            self.make_warcinfo(writer)

            for file_info in self.iter_inputs():
                made = self.make_record(writer, file_info)
                if not made:
                    self.logger.debug('Skipping {0}'.format(file_info.url))
                    continue

                url, record = made

                # The current file serves as a directory index.
                if url.lower().endswith(self.index_files):
                    self.make_index_revisit(writer, url, record)

                if self.conversion_serializer:
                    self.make_conversions(writer, url, record)

                if self.transclusion_serializer:
                    self.make_transclusion_metadata(writer, url, record)

        summary = 'Wrote {0} resources to {1}'.format(self.count, self.name)
        self.logger.info(summary)

        self.close_logfile()

        return 0
Beispiel #23
0
    def process(self, item):
        """Run every record of the item's WARC through ``process_record``.

        Reads ``<item_dir>/<warc_file_base>.warc.gz``, writes the value
        returned by ``self.process_record`` for each record to
        ``<data_dir>/<warc_file_base>.warc.gz`` and prints how many records
        were processed and how many entries the shared ``records`` dict
        holds afterwards.

        :param item: mapping providing ``item_dir``, ``data_dir`` and
            ``warc_file_base``.
        """
        records = {}
        num_records = 0

        filename_in = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
        filename_out = '%(data_dir)s/%(warc_file_base)s.warc.gz' % item

        with open(filename_in, 'rb') as source, \
                open(filename_out, 'wb') as sink:
            writer = WARCWriter(filebuf=sink, gzip=True)
            for num_records, record in enumerate(ArchiveIterator(source),
                                                 start=1):
                converted = self.process_record(writer, record,
                                                filename_out, records)
                writer.write_record(converted)

        summary = 'Processed {} payloads, found {} unique payloads.'.format(
            num_records, len(records))
        print(summary)
Beispiel #24
0
def extract_record(cmd):
    """Write matching records from the input files to stdout, uncompressed.

    A record matches when its type is listed in ``cmd.records`` (comma
    separated) and either no ``cmd.uri`` was given, the record has no URI,
    or the URI equals ``cmd.uri``. Unless ``cmd.print_all`` is set, reading
    of a file stops after its first match.

    :param cmd: object with ``records``, ``inputs``, ``uri`` and
        ``print_all`` attributes.
    """
    records = cmd.records.split(',')
    writer = WARCWriter(filebuf=sys.stdout.buffer, gzip=False)
    for filename in cmd.inputs:
        with open(filename, 'rb') as fh:
            for record in ArchiveIterator(fh,
                                          no_record_parse=True,
                                          arc2warc=True):
                # Reset per record: an unrecognised format must not reuse
                # the previous record's URI (or hit an unbound name).
                rec_uri = None
                if record.format == 'arc':
                    rec_uri = record.rec_headers.get_header('uri')
                elif record.format in ('warc', 'arc2warc'):
                    rec_uri = record.rec_headers.get_header('WARC-Target-URI')

                uri_matches = (cmd.uri is None or rec_uri is None
                               or cmd.uri == rec_uri)
                if record.rec_type in records and uri_matches:
                    writer.write_record(record)
                    if not cmd.print_all:
                        break
Beispiel #25
0
    def test_redir_init_slash(self):
        """Write two response records under ``/sub/path/`` (one with a query
        string) to ``redir-slash.warc.gz`` and add the file to ``redir``."""
        filename = os.path.join(self.root_dir, 'redir-slash.warc.gz')
        captures = (
            ('https://www.example.com/sub/path/', 'Sub Path Data'),
            ('https://www.example.com/sub/path/?foo=bar', 'Sub Path Data Q'),
        )
        with open(filename, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)

            for url, text in captures:
                self.create_response_record(url, '201806026101112', text)

        wb_manager(['add', 'redir', filename])
Beispiel #26
0
def convert_to_warc(website, filename):
    """Fetch ``website`` and store the response in a gzipped WARC file.

    The output is written to ``filename + '.warc.gz'`` as a single
    'response' record.

    Args:
        website: URL to request; also used as the record's target URI.
        filename: Output path without the '.warc.gz' suffix.
    """
    with open(filename + '.warc.gz', 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        # 'identity' asks the server not to compress the body, so the raw
        # stream can be stored as received.
        resp = requests.get(website,
                            headers={'Accept-Encoding': 'identity'},
                            stream=True)
        try:
            # get raw headers from urllib3
            headers_list = resp.raw.headers.items()

            # Record the status the server actually sent instead of
            # claiming every capture was a "200 OK".
            status_line = '{} {}'.format(resp.status_code,
                                         resp.reason or '').strip()
            http_headers = StatusAndHeaders(status_line, headers_list,
                                            protocol='HTTP/1.0')

            record = writer.create_warc_record(website, 'response',
                                               payload=resp.raw,
                                               http_headers=http_headers)

            writer.write_record(record)
        finally:
            # The response is streamed, so release it explicitly.
            resp.close()
Beispiel #27
0
    def test_input_manifest(self):
        """Run MRPhoneToURL over a gzipped file on disk plus raw input.

        One uncompressed in-memory archive is supplied as ``raw_input``
        (the job is given '-' as an input argument) and one gzipped archive
        is supplied by path; the job output must equal
        ``self.EXPECTED_OUTPUT``.
        """
        stdin_wet = BytesIO()
        stdin_writer = WARCWriter(stdin_wet, gzip=False)
        stdin_records = (
            ('https://nophonenumbershere.info', b'THIS-IS-NOT-A-NUMBER'),
            ('https://big.directory/',
             b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n'),
        )
        for url, text in stdin_records:
            write_conversion_record(stdin_writer, url, text)

        gz_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
        with open(gz_path, 'wb') as gz_file:
            write_conversion_record(WARCWriter(gz_file, gzip=True),
                                    'https://jseventplanning.biz/',
                                    b'contact us at +1 201 867 5309')

        job = MRPhoneToURL(['-r', self.RUNNER, gz_path, '-'])
        actual = run_job(job, raw_input=stdin_wet.getvalue())
        self.assertEqual(actual, self.EXPECTED_OUTPUT)
Beispiel #28
0
    def create_temp_warc(cls):
        """Write a temporary '.warc.gz' with two records and return its path.

        Both records are created through ``cls.create_record`` for
        'http://example.com/' and differ only in their timestamp argument.
        The temporary file is created with ``delete=False``, so it remains
        on disk after this method returns.
        """
        with NamedTemporaryFile(delete=False, suffix='.warc.gz') as tmp:
            warc_writer = WARCWriter(tmp, gzip=True)
            for timestamp in ('20140101000000', '20170101000000'):
                cls.create_record(warc_writer, 'http://example.com/',
                                  'Example Domain', timestamp)

        return tmp.name
Beispiel #29
0
    def __init__(self, reader, writer, gzip=True):
        """Load the HAR data and prepare the WARC writer.

        ``reader`` may be a path string (read as UTF-8 JSON), an object
        with a ``read()`` method returning JSON, or an already-parsed dict.

        ``writer`` may be a ``BaseWARCWriter`` instance (used as-is), a path
        string (opened for binary writing and kept on ``self.fh``), or an
        object with a ``write()`` method.  ``gzip`` is forwarded to the
        ``WARCWriter`` created in the last two cases.

        Raises ``Exception`` when either argument has an unsupported type.
        """
        # --- input side -------------------------------------------------
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as har_file:
                har_text = har_file.read()
            self.har = json.loads(har_text)
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        # --- output side ------------------------------------------------
        # self.fh is only set when this object opened the file itself.
        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
            return

        if isinstance(writer, str):
            self.fh = open(writer, 'wb')
            target = self.fh
        elif hasattr(writer, 'write'):
            target = writer
        else:
            raise Exception('writer is in an unknown format')

        self.writer = WARCWriter(target, gzip=gzip)
Beispiel #30
0
def run(url, outPath, timeLimit, agent, filetypes, warcfilename, wait):
    """Mirror ``url`` with wget and leave a gzip-compressed WARC behind.

    Args:
        url: Start URL passed to ``wget --mirror``.
        outPath: Download directory (wget ``-P``).
        timeLimit: If truthy, the wget call is wrapped in ``timeout``.
        agent: Optional user-agent string (``--user-agent``).
        filetypes: Optional accept list (wget ``-A``).
        warcfilename: Name of the WARC to produce.  Everything from the
            first ".warc.gz" onwards is stripped to form the base name
            given to ``--warc-file``; the result is written to
            ``<base>.warc.gz`` and the intermediate ``<base>.warc`` is
            removed.  ``None`` disables WARC output.
        wait: Optional value for wget ``--wait``.

    A ``subprocess.CalledProcessError`` raised by ``system_check`` for the
    wget call is reported as a warning on stderr; the WARC is still
    converted.
    """
    import shlex

    # Every user-supplied value is shell-quoted because the command is
    # handed to system_check as a single string.
    command = []
    if timeLimit:
        command.append("timeout " + shlex.quote(str(timeLimit)))
    command.append("wget --mirror")
    if wait is not None:
        command.append("--wait " + shlex.quote(str(wait)))
    if filetypes is not None:
        command.append("-A " + shlex.quote(str(filetypes)))
    command.append("-q " + shlex.quote(str(url)))
    command.append("-P " + shlex.quote(str(outPath)))
    if agent is not None:
        command.append("--user-agent " + shlex.quote(str(agent)))

    warcfilebasename = None
    if warcfilename is not None:
        # str.find() returns -1 when the suffix is absent; slicing with
        # that would silently drop the last character of the name.
        suffix_at = warcfilename.find(".warc.gz")
        if suffix_at == -1:
            warcfilebasename = warcfilename
        else:
            warcfilebasename = warcfilename[:suffix_at]
        command.append("--warc-file " + shlex.quote(warcfilebasename))
        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            command.append("--no-warc-compression")

    download_incomplete = False
    try:
        system_check(" ".join(command))
    except subprocess.CalledProcessError:
        # Whatever was captured before the failure is still converted.
        download_incomplete = True

    if warcfilebasename is not None:
        with open(warcfilebasename + ".warc", 'rb') as f_in, \
                open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            for record in ArchiveIterator(f_in):
                writer.write_record(record)

    if download_incomplete:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")

    if warcfilebasename is not None:
        system_check("rm " + shlex.quote(warcfilebasename + ".warc"))
Beispiel #31
0
    def _fetch_warc(self, action_result, url, out_path):
        """Fetch ``url`` and store the response in a gzipped WARC file.

        Args:
            action_result: Not used by this method.
            url: URL to request; also used as the record's target URI.
            out_path: Path of the WARC file to create.

        Returns:
            ``out_path``.
        """
        with open(out_path, "wb") as output:
            writer = WARCWriter(output, gzip=True)

            # "identity" asks the server not to compress the body, so the
            # raw stream can be stored as received.
            resp = requests.get(url,
                                headers={"Accept-Encoding": "identity"},
                                stream=True)
            try:
                # get raw headers from urllib3
                headers_list = resp.raw.headers.items()

                # Record the status the server actually sent instead of
                # claiming every capture was a "200 OK".
                status_line = "{} {}".format(resp.status_code,
                                             resp.reason or "").strip()
                http_headers = StatusAndHeaders(status_line,
                                                headers_list,
                                                protocol="HTTP/1.0")

                record = writer.create_warc_record(url,
                                                   "response",
                                                   payload=resp.raw,
                                                   http_headers=http_headers)

                writer.write_record(record)
            finally:
                # The response is streamed, so release it explicitly.
                resp.close()

        return out_path
Beispiel #32
0
    def run(self):
        """Consume ``self.out_queue`` forever, appending to ``self.warcfile``.

        Each queue item is read by index: ``[0]`` header list, ``[1]``
        payload, ``[2]`` target URI, ``[3]`` and ``[4]`` the two parts of
        the HTTP status line.  An item whose digest is already known to
        ``self.dedup`` is written as a 'revisit' record without payload;
        otherwise the digest is saved and the full 'response' record is
        written.

        ``self.lock`` is held for the whole of each iteration, including
        the blocking ``get()``.  The lock is released and the queue item is
        marked done even when writing fails, so an error cannot leave the
        lock held or ``out_queue.join()`` waiting forever; the exception
        itself still propagates.
        """
        with open(self.warcfile, 'ab') as output:
            # The writer only wraps the output stream, so one instance
            # serves every record.
            writer = WARCWriter(output, gzip=False)
            while True:
                self.lock.acquire()
                try:
                    data = self.out_queue.get()
                    try:
                        headers_list = data[0]
                        http_headers = StatusAndHeaders(
                            '{} {}'.format(data[3], data[4]),
                            headers_list, protocol='HTTP/1.0')
                        record = writer.create_warc_record(
                            data[2], 'response', payload=data[1],
                            http_headers=http_headers)

                        # Only the first BLOCK_SIZE bytes of the record
                        # stream feed the digest.
                        h = hashlib.sha1()
                        h.update(record.raw_stream.read(BLOCK_SIZE))
                        digest = h.hexdigest()

                        if self.dedup.lookup(digest):
                            record = writer.create_warc_record(
                                data[2], 'revisit',
                                http_headers=http_headers)
                        else:
                            self.dedup.save(digest, data[2])
                            # Rewind: hashing consumed part of the stream.
                            record.raw_stream.seek(0)
                        writer.write_record(record)
                    finally:
                        self.out_queue.task_done()
                finally:
                    self.lock.release()