Exemple #1
0
    def test_init_2(self):
        """Write a WARC with a two-hop redirect chain and index it as ``redir2``.

        Records written, in order: a redirect from the http URL to the https
        URL with a trailing slash, a 302 redirect on to the ``www2`` host, a
        response on ``www2``, and a revisit of that response. The test then
        checks that ``init`` + ``add`` produced the collection's index file.
        """
        warc_path = os.path.join(self.root_dir, 'redir2.warc.gz')

        with open(warc_path, 'wb') as stream:
            # presumably the create_* helpers write through self.writer;
            # verify against their definitions
            self.writer = WARCWriter(stream, gzip=True)

            self.create_redirect_record(
                'http://www.example.com/path',
                'https://www.example.com/path/',
                '20191003115920')
            self.create_redirect_record(
                'https://www.example.com/path/',
                'https://www2.example.com/path',
                '20191003115927',
                status='302')

            original = self.create_response_record(
                'https://www2.example.com/path', '20191024125646', 'Some Text')

            # The revisit points back at the response via its WARC-Date header.
            self.create_revisit_record(
                'https://www2.example.com/path', '20191024125648',
                'https://www2.example.com/path',
                original.rec_headers['WARC-Date'])

        wb_manager(['init', 'redir2'])
        wb_manager(['add', 'redir2', warc_path])

        index_path = os.path.join(
            self.root_dir, self.COLLS_DIR, 'redir2', 'indexes', 'index.cdxj')
        assert os.path.isfile(index_path)
Exemple #2
0
    def test_init_1(self):
        """Write a WARC with redirects, a response and revisits; index as ``redir``.

        Records written, in order: http -> https redirect, https -> www
        redirect, a response on the www host, then two revisits that both
        reuse the response's WARC-Date. The test then checks that ``init`` +
        ``add`` produced the collection's index file.
        """
        warc_path = os.path.join(self.root_dir, 'redir.warc.gz')

        with open(warc_path, 'wb') as stream:
            # presumably the create_* helpers write through self.writer;
            # verify against their definitions
            self.writer = WARCWriter(stream, gzip=True)

            self.create_redirect_record(
                'http://example.com/', 'https://example.com/',
                '20180626101112')
            self.create_redirect_record(
                'https://example.com/', 'https://www.example.com/',
                '20180626101112')

            original = self.create_response_record(
                'https://www.example.com/', '20180626101112', 'Some Text')
            original_date = original.rec_headers['WARC-Date']

            # First revisit: URL and the URL it refers to differ.
            self.create_revisit_record(
                'https://example.com/path', '20190626101112',
                'https://example.com/abc', original_date)
            # Second revisit: refers to the same URL as the response above.
            self.create_revisit_record(
                'https://www.example.com/', '20190626101112',
                'https://www.example.com/', original_date)

        wb_manager(['init', 'redir'])
        wb_manager(['add', 'redir', warc_path])

        index_path = os.path.join(
            self.root_dir, self.COLLS_DIR, 'redir', 'indexes', 'index.cdxj')
        assert os.path.isfile(index_path)
Exemple #3
0
    def test_acl_match(self, capsys):
        wb_manager(
            ['acl', 'match', self.acl_filename, 'http://abc.example.com/foo'])

        out, err = capsys.readouterr()

        assert out == """\
Exemple #4
0
    def test_remove_acl_user(self):
        wb_manager([
            'acl', 'remove', self.acl_filename, 'com,example)/', '-u', 'public'
        ])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
Exemple #5
0
    def test_acl_add_with_user(self):
        wb_manager([
            'acl', 'add', self.acl_filename, 'http://example.com/', 'block',
            '-u', 'public'
        ])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
Exemple #6
0
    def test_acl_add_exact(self):
        wb_manager([
            'acl', 'add', '--exact-match', self.acl_filename, 'example.com',
            'block'
        ])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
    def test_import_errors(self):
        """Each invalid ``acl importtxt`` invocation must end in ``SystemExit``."""
        failing_invocations = (
            # missing access mode
            ['acl', 'importtxt', self.acl_filename, 'foo'],
            # no such file
            ['acl', 'importtxt', self.acl_filename, 'foo', 'exclude'],
        )

        for argv in failing_invocations:
            with raises(SystemExit):
                wb_manager(argv)
Exemple #8
0
    def test_acl_match_unknown_user(self, capsys):
        wb_manager([
            'acl', 'match', self.acl_filename, 'http://example.com/foo', '-u',
            'data'
        ])

        out, err = capsys.readouterr()

        assert out == """\
Exemple #9
0
    def test_redir_init_slash(self):
        """Write two response records under ``/sub/path/`` and add them to ``redir``.

        One capture is the bare trailing-slash URL, the other the same URL
        with a query string. No ``init`` is issued here.
        """
        # presumably the 'redir' collection was created by an earlier test;
        # verify test ordering
        warc_path = os.path.join(self.root_dir, 'redir-slash.warc.gz')

        # NOTE(review): this timestamp has 15 digits rather than the usual 14;
        # kept exactly as written -- confirm whether that is intentional.
        captures = (
            ('https://www.example.com/sub/path/', 'Sub Path Data'),
            ('https://www.example.com/sub/path/?foo=bar', 'Sub Path Data Q'),
        )

        with open(warc_path, 'wb') as stream:
            self.writer = WARCWriter(stream, gzip=True)
            for url, payload in captures:
                self.create_response_record(url, '201806026101112', payload)

        wb_manager(['add', 'redir', warc_path])
    def test_acl_list(self, capsys):
        wb_manager(['acl', 'list', self.acl_filename])

        out, err = capsys.readouterr()

        assert out == """\
Rules for %s from %s:

com,example, - {"access": "exclude", "url": "com,example,"}
com,example)/ - {"access": "allow", "url": "http://example.com/"}

""" % (self.acl_filename, self.acl_filename)
    def test_validate_and_sort_acl(self):
        with open(self.acl_filename, 'at') as fh:
            fh.write('com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"}\n')

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
com,example)/ - {"access": "allow", "url": "http://example.com/"}
com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"}
"""

        wb_manager(['acl', 'validate', self.acl_filename])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
def add_collection(*, folder):
    """Create the collection for *folder*, index its archives and register a tracker.

    The archive directory is ``CONFIG.arc_source_directory / folder``. Inside
    ``CONFIG.working_directory`` the collection is initialised with
    ``wb_manager init``; if that succeeds, every path yielded by
    ``find_arcs`` is added and the resulting ``index.cdxj`` is scanned.
    Index entries with mime ``text/html``, status ``200`` and a URL matching
    the front-page pattern below become rows of a DataFrame, which is wrapped
    in a ``CollectionTracker`` and stored in ``collection_state[folder]``.

    Args:
        folder: Name of the sub-directory of the archive source directory;
            also used as the collection name.

    Raises:
        ValueError: If the archive directory for *folder* does not exist.
    """
    arc_folder = Path(CONFIG.arc_source_directory).absolute() / folder
    if not os.path.exists(arc_folder):
        raise ValueError(
            "archive source folder does not exist: {}".format(arc_folder))

    wdir = Path(CONFIG.working_directory).absolute()

    # Bound up front: previously `data` only existed when the collection was
    # freshly created, so an already-existing collection crashed with
    # UnboundLocalError at the DataFrame construction below.
    # NOTE(review): in that case an empty tracker is now registered for
    # `folder` -- confirm this is the desired outcome for existing collections.
    data = {c: [] for c in CollectionTracker.COLUMNS}

    # Loop-invariant, so compiled once. Matches a scheme, a host made of word
    # characters and dots that ends in "no", an optional port and an optional
    # trailing slash -- i.e. URLs without a path.
    front_page_re = re.compile(r"^https?:\/\/[\w.]+no(:\d+)?\/?$")

    with cd(wdir):
        try:
            wb_manager(["init", folder])
        except FileExistsError:
            # Collection directory already present: skip adding and indexing.
            do_add = False
        else:
            do_add = True

        if do_add:
            wb_manager(["add", folder, *find_arcs(arc_folder)])

            index_path = wdir / "collections" / folder / "indexes" / "index.cdxj"
            with open(index_path) as index_file:
                for entry in index_file:
                    if not entry.strip():
                        # A blank line cannot be unpacked into three fields.
                        continue

                    # Each entry is "<surt> <timestamp> <json>"; the JSON part
                    # may itself contain spaces, hence maxsplit=2.
                    _surt, ts, raw_json = entry.split(maxsplit=2)
                    fields = json.loads(raw_json)

                    if fields.get("mime", "") != "text/html":
                        continue
                    if fields.get("status", "") != "200":
                        continue
                    url = fields.get("url", "")
                    if not front_page_re.search(url):
                        continue

                    # Values are paired positionally with
                    # CollectionTracker.COLUMNS; the final column starts empty.
                    row = (
                        fields.get("filename", ""),
                        ts,
                        url,
                        Verdict.UNDECIDED.value,
                        fields.get("digest", ""),
                        "",
                    )
                    for c, val in zip(CollectionTracker.COLUMNS, row):
                        data[c].append(val)

    df = pd.DataFrame(data)
    collection_state[folder] = CollectionTracker(folder, df)
    def test_importtxt_acl(self, capsys):
        name = os.path.join(self.root_dir, 'excludes.txt')
        with open(name, 'wt') as exc:
            exc.write('http://iana.org/\n')
            exc.write('http://example.com/subpath/another\n')
            exc.write('http://example.co/foo/\n')
            exc.write('http://example.com/\n')

        wb_manager(['acl', 'importtxt', self.acl_filename, name, 'exclude'])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
org,iana)/ - {"access": "exclude", "url": "http://iana.org/"}
com,example)/subpath/another - {"access": "exclude", "url": "http://example.com/subpath/another"}
com,example)/subpath - {"access": "block", "url": "http://example.com/subpath"}
com,example)/ - {"access": "exclude", "url": "http://example.com/"}
co,example)/foo - {"access": "exclude", "url": "http://example.co/foo/"}
"""

        out, err = capsys.readouterr()

        assert 'Added or replaced 4 rules from {0}'.format(name) in out, out

        os.remove(name)
    def test_remove_acl_exact(self):
        wb_manager(['acl', 'remove', '-e', self.acl_filename, 'https://example.com/'])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
 def test_acl_list_err_no_such_file(self):
     """``acl list`` on the ACL filename with ``'2'`` appended must exit."""
     # presumably no file exists under this altered name; verify fixture setup
     other_path = self.acl_filename + '2'
     list_args = ['acl', 'list', other_path]

     with raises(SystemExit):
         wb_manager(list_args)
    def test_acl_add_surt(self):
        wb_manager(['acl', 'add', self.acl_filename, 'com,example,', 'exclude'])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
    def test_acl_add(self):
        wb_manager(['acl', 'add', self.acl_filename, 'http://example.com/', 'allow'])

        with open(self.acl_filename, 'rt') as fh:
            assert fh.read() == """\
 def test_acl_add_err_wrong_access(self):
     """``acl add`` must exit when given ``'access'`` as the access argument."""
     add_args = [
         'acl', 'add', self.acl_filename, 'http://example.com/', 'access',
     ]

     with raises(SystemExit):
         wb_manager(add_args)