def test_init_2(self):
    """Write a WARC with two redirects, a response and a revisit, then index it.

    The records are created through the ``create_*_record`` helpers while
    ``self.writer`` points at the open gzip WARC file (the helpers are
    presumably writing through ``self.writer`` -- confirm against their
    definitions). Afterwards the 'redir2' collection is initialised, the
    WARC is added, and the test checks that the collection's CDXJ index
    file exists.
    """
    warc_path = os.path.join(self.root_dir, 'redir2.warc.gz')
    final_url = 'https://www2.example.com/path'

    with open(warc_path, 'wb') as stream:
        self.writer = WARCWriter(stream, gzip=True)

        # http -> https (with trailing slash) -> www2 host
        self.create_redirect_record(
            'http://www.example.com/path',
            'https://www.example.com/path/',
            '20191003115920')
        self.create_redirect_record(
            'https://www.example.com/path/',
            final_url,
            '20191003115927',
            status='302')

        original = self.create_response_record(
            final_url, '20191024125646', 'Some Text')

        # The revisit points back at the response above via its WARC-Date.
        self.create_revisit_record(
            final_url,
            '20191024125648',
            final_url,
            original.rec_headers['WARC-Date'])

    # Index only after the WARC file has been closed.
    for command in (['init', 'redir2'], ['add', 'redir2', warc_path]):
        wb_manager(command)

    index_path = os.path.join(
        self.root_dir, self.COLLS_DIR, 'redir2', 'indexes', 'index.cdxj')
    assert os.path.isfile(index_path)
def test_init_1(self):
    """Write a WARC with two redirects, a response and two revisits, then index it.

    Records are created through the ``create_*_record`` helpers while
    ``self.writer`` points at the open gzip WARC file. The 'redir'
    collection is then initialised, the WARC is added, and the test checks
    that the collection's CDXJ index file exists.
    """
    warc_path = os.path.join(self.root_dir, 'redir.warc.gz')
    capture_ts = '20180626101112'
    revisit_ts = '20190626101112'

    with open(warc_path, 'wb') as stream:
        self.writer = WARCWriter(stream, gzip=True)

        # http -> https -> www host
        self.create_redirect_record(
            'http://example.com/', 'https://example.com/', capture_ts)
        self.create_redirect_record(
            'https://example.com/', 'https://www.example.com/', capture_ts)

        original = self.create_response_record(
            'https://www.example.com/', capture_ts, 'Some Text')

        # (revisit url, url it refers to); both reuse the response's WARC-Date.
        revisits = (
            ('https://example.com/path', 'https://example.com/abc'),
            ('https://www.example.com/', 'https://www.example.com/'),
        )
        for url, refers_to in revisits:
            self.create_revisit_record(
                url, revisit_ts, refers_to,
                original.rec_headers['WARC-Date'])

    # Index only after the WARC file has been closed.
    for command in (['init', 'redir'], ['add', 'redir', warc_path]):
        wb_manager(command)

    index_path = os.path.join(
        self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj')
    assert os.path.isfile(index_path)
def test_acl_match(self, capsys): wb_manager( ['acl', 'match', self.acl_filename, 'http://abc.example.com/foo']) out, err = capsys.readouterr() assert out == """\
def test_remove_acl_user(self): wb_manager([ 'acl', 'remove', self.acl_filename, 'com,example)/', '-u', 'public' ]) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_acl_add_with_user(self): wb_manager([ 'acl', 'add', self.acl_filename, 'http://example.com/', 'block', '-u', 'public' ]) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_acl_add_exact(self): wb_manager([ 'acl', 'add', '--exact-match', self.acl_filename, 'example.com', 'block' ]) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_import_errors(self):
    """Each malformed ``acl importtxt`` invocation must end in SystemExit."""
    failing_invocations = (
        # missing access mode
        ['acl', 'importtxt', self.acl_filename, 'foo'],
        # no such file
        ['acl', 'importtxt', self.acl_filename, 'foo', 'exclude'],
    )
    for argv in failing_invocations:
        with raises(SystemExit):
            wb_manager(argv)
def test_acl_match_unknown_user(self, capsys): wb_manager([ 'acl', 'match', self.acl_filename, 'http://example.com/foo', '-u', 'data' ]) out, err = capsys.readouterr() assert out == """\
def test_redir_init_slash(self):
    """Write two response records for a slash-terminated path and add them to 'redir'.

    One capture is for the bare path and one for the same path with a
    query string; the resulting WARC is added to the 'redir' collection
    (no 'init' is issued here, so the collection is expected to exist
    already).
    """
    warc_path = os.path.join(self.root_dir, 'redir-slash.warc.gz')

    # NOTE(review): this timestamp has 15 digits ('201806026101112');
    # it may be a typo for a 14-digit value -- kept as-is, confirm intent.
    capture_ts = '201806026101112'
    captures = (
        ('https://www.example.com/sub/path/', 'Sub Path Data'),
        ('https://www.example.com/sub/path/?foo=bar', 'Sub Path Data Q'),
    )

    with open(warc_path, 'wb') as stream:
        self.writer = WARCWriter(stream, gzip=True)
        for url, payload in captures:
            self.create_response_record(url, capture_ts, payload)

    # Index only after the WARC file has been closed.
    wb_manager(['add', 'redir', warc_path])
# Runs 'acl list' on self.acl_filename and compares captured stdout with the
# expected listing: a "Rules for ... from ...:" header (the ACL filename is
# substituted into both placeholders) followed by the two rules shown below.
# NOTE(review): the line breaks inside the expected string appear to have been
# lost in this view, so the code is left byte-identical rather than reformatted.
def test_acl_list(self, capsys): wb_manager(['acl', 'list', self.acl_filename]) out, err = capsys.readouterr() assert out == """\ Rules for %s from %s: com,example, - {"access": "exclude", "url": "com,example,"} com,example)/ - {"access": "allow", "url": "http://example.com/"} """ % (self.acl_filename, self.acl_filename)
def test_validate_and_sort_acl(self): with open(self.acl_filename, 'at') as fh: fh.write('com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"}\n') with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\ com,example)/ - {"access": "allow", "url": "http://example.com/"} com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"} """ wb_manager(['acl', 'validate', self.acl_filename]) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def add_collection(*, folder):
    """Create the collection ``folder`` and register a tracker for it.

    The archive files found under ``CONFIG.arc_source_directory / folder``
    are added to a collection named ``folder`` inside
    ``CONFIG.working_directory``. The collection's ``index.cdxj`` is then
    scanned and every entry that is an HTML page with status "200" and a
    URL matching the front-page pattern below is collected into a
    DataFrame with the columns ``CollectionTracker.COLUMNS``. The
    resulting ``CollectionTracker`` is stored in
    ``collection_state[folder]``.

    If ``wb_manager init`` raises ``FileExistsError`` (presumably because
    the collection already exists), nothing is added and
    ``collection_state`` is left untouched.

    Args:
        folder: Name of the sub-folder of the archive source directory;
            also used as the collection name. Keyword-only.

    Raises:
        ValueError: If the archive source folder does not exist.
    """
    arc_folder = Path(CONFIG.arc_source_directory).absolute() / folder
    if not os.path.exists(arc_folder):
        raise ValueError(f"archive source folder does not exist: {arc_folder}")

    wdir = Path(CONFIG.working_directory).absolute()
    index_path = wdir / "collections" / folder / "indexes" / "index.cdxj"
    columns = CollectionTracker.COLUMNS

    # Compiled once instead of once per index line.
    # NOTE(review): "[\w.]+no" does not require a dot before "no" and does
    # not allow hyphens in the host -- confirm this is the intended filter.
    url_pattern = re.compile(r"^https?:\/\/[\w.]+no(:\d+)?\/?$")

    # wb_manager is run with the working directory as the current directory.
    with cd(wdir):
        try:
            wb_manager(["init", folder])
        except FileExistsError:
            return

        data = {column: [] for column in columns}
        wb_manager(["add", folder, *find_arcs(arc_folder)])

        with open(index_path) as index_file:
            for entry in index_file:
                # Each line is "<surt> <timestamp> <json>"; the surt key is unused.
                _surt, ts, payload = entry.split(maxsplit=2)
                fields = json.loads(payload)
                url = fields.get("url", "")
                is_html_ok = (
                    fields.get("mime", "") == "text/html"
                    and fields.get("status", "") == "200"
                )
                if is_html_ok and url_pattern.search(url):
                    row = (
                        fields.get("filename", ""),
                        ts,
                        url,
                        Verdict.UNDECIDED.value,
                        fields.get("digest", ""),
                        "",
                    )
                    for column, value in zip(columns, row):
                        data[column].append(value)

        collection_state[folder] = CollectionTracker(folder, pd.DataFrame(data))
# Writes four URLs to 'excludes.txt' under self.root_dir, imports them into the
# ACL file with 'acl importtxt ... exclude', then checks the ACL file contents
# and that stdout contains 'Added or replaced 4 rules from <name>'. The
# temporary text file is removed at the end.
# NOTE(review): the line breaks inside the expected string appear to have been
# lost in this view, so the code is left byte-identical rather than reformatted.
def test_importtxt_acl(self, capsys): name = os.path.join(self.root_dir, 'excludes.txt') with open(name, 'wt') as exc: exc.write('http://iana.org/\n') exc.write('http://example.com/subpath/another\n') exc.write('http://example.co/foo/\n') exc.write('http://example.com/\n') wb_manager(['acl', 'importtxt', self.acl_filename, name, 'exclude']) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\ org,iana)/ - {"access": "exclude", "url": "http://iana.org/"} com,example)/subpath/another - {"access": "exclude", "url": "http://example.com/subpath/another"} com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"} com,example)/ - {"access": "exclude", "url": "http://example.com/"} co,example)/foo - {"access": "exclude", "url": "http://example.co/foo/"} """ out, err = capsys.readouterr() assert 'Added or replaced 4 rules from {0}'.format(name) in out, out os.remove(name)
def test_remove_acl_exact(self): wb_manager(['acl', 'remove', '-e', self.acl_filename, 'https://example.com/']) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_acl_list_err_no_such_file(self):
    """``acl list`` on a different (presumably non-existent) path must end in SystemExit."""
    with raises(SystemExit):
        # Appending '2' yields a path other than the real ACL file.
        other_path = self.acl_filename + '2'
        wb_manager(['acl', 'list', other_path])
def test_acl_add_surt(self): wb_manager(['acl', 'add', self.acl_filename, 'com,example,', 'exclude']) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_acl_add(self): wb_manager(['acl', 'add', self.acl_filename, 'http://example.com/', 'allow']) with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\
def test_acl_add_err_wrong_access(self):
    """``acl add`` with the access value 'access' must end in SystemExit."""
    with raises(SystemExit):
        argv = ['acl', 'add', self.acl_filename, 'http://example.com/', 'access']
        wb_manager(argv)