Example #1
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"  # raw string avoids an invalid-escape warning
        crawl_init(template_kwargs, save=True, template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
Example #2
def test_obscure_names(path):
    bucket = "datalad-test2-obscurenames-versioned"
    get_test_providers('s3://' + bucket)  # to verify having s3 credentials
    create(path)
    with externals_use_cassette('test_simple_s3_test2_obscurenames_versioned_crawl_ext'), \
         chpwd(path):
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        crawl()
    # fun with unicode was postponed
    ok_clean_git(path, annex=True)
    for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
        ok_file_under_git(path, f, annexed=True)
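
The annex status of those obscure names can also be checked programmatically through AnnexRepo, as the later drop tests do. A sketch, assuming the AnnexRepo.is_under_annex API; path is the dataset root from the test above.

from datalad.support.annexrepo import AnnexRepo

# Sketch: 'path' is the crawled dataset root from the example above.
repo = AnnexRepo(path, create=False)
for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
    assert repo.is_under_annex(f)   # annexed, unlike small text files
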
Example #3
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init(
            {'url': topurl, 'a_href_match_': '.*'},
            save=True,
            template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
Example #4
def _test_crawl_init_error_patch(return_value, exc, exc_msg, d):
    ar = AnnexRepo(d, create=True)  # set up an annex repo for crawl_init to act on
    with patch('datalad_crawler.crawl_init.load_pipeline_from_template',
               return_value=lambda dataset: return_value) as cm:
        with chpwd(d):
            with assert_raises(exc) as cm2:
                crawl_init(args=['dataset=Baltimore'], template='openfmri')
            assert_in(exc_msg, str(cm2.exception))

            cm.assert_called_with('openfmri',
                                  None,
                                  return_only=True,
                                  kwargs=OrderedDict([('dataset', 'Baltimore')]))
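
The assertion above pins down how crawl_init is expected to call load_pipeline_from_template. A direct call with the same arguments would presumably look like the sketch below; the import path and signature are inferred from the patch target and the assertion, so treat them as assumptions.

from collections import OrderedDict
from datalad_crawler.pipeline import load_pipeline_from_template  # assumed location

# Signature inferred from cm.assert_called_with above.
pipeline = load_pipeline_from_template(
    'openfmri',                                   # template name
    None,                                         # template_func: use the default
    return_only=True,                             # load the pipeline, do not run it
    kwargs=OrderedDict([('dataset', 'Baltimore')]),
)
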
Example #5
def _test_crawl_init(args, template, template_func, save, target_value,
                     tmpdir):
    ar = AnnexRepo(tmpdir, create=True)
    with chpwd(tmpdir):
        crawl_init(args=args,
                   template=template,
                   template_func=template_func,
                   save=save)
        eq_(exists(CRAWLER_META_DIR), True)
        eq_(exists(CRAWLER_META_CONFIG_PATH), True)
        with open(CRAWLER_META_CONFIG_PATH, 'r') as f:
            contents = f.read()
        eq_(contents, target_value)
        if save:
            ds = Dataset(tmpdir)
            ok_clean_git(tmpdir, annex=isinstance(ds.repo, AnnexRepo))
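
Since crawl_init persists the pipeline specification on disk, the resulting config can be inspected directly. A sketch reusing the same constants this test imports (their exact values and location depend on the installed datalad-crawler version):

from datalad_crawler.consts import CRAWLER_META_CONFIG_PATH  # assumed location

# Print the persisted crawl configuration, e.g. template name and args.
with open(CRAWLER_META_CONFIG_PATH) as f:
    print(f.read())
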
Example #6
def test_crawl(tempd):
    if not _get_github_cred().is_known:
        raise SkipTest("no github credential")
    ds = create(tempd)
    with chpwd(tempd):
        crawl_init(template='gh',
                   save=True,
                   args={
                       'org': 'datalad-collection-1',
                       'include': 'kaggle'
                   })
        crawl()
    subdss = ds.subdatasets(fulfilled=True, result_xfm='datasets')
    assert all('kaggle' in d.path for d in subdss)
    assert_greater(len(subdss), 1)
    assert_false(ds.repo.dirty)
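
The result_xfm='datasets' transform turns the subdataset records into Dataset objects, which makes post-crawl inspection convenient. A sketch, with a placeholder path standing in for the crawled superdataset:

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/gh-crawl')   # placeholder: a superdataset crawled as above
for sub in ds.subdatasets(fulfilled=True, result_xfm='datasets'):
    print(sub.path)             # one installed subdataset per matching repo
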
Example #7
def _test_drop(path, drop_immediately):
    s3url = 's3://datalad-test0-nonversioned'
    providers = get_test_providers(s3url)  # to verify having s3 credentials
    # The vcr tape gets bound to the session object, so we need to force
    # re-establishing the session for the bucket.
    # TODO (in datalad): make a dedicated API for that; for now it is too obscure
    _ = providers.get_status(s3url, allow_old_session=False)
    create(path)
    # unfortunately this doesn't work without force-dropping, since vcr
    # apparently stops and then gets queried again for the same tape while
    # testing the drop :-/
    with chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True,  # so the test goes faster
                drop_immediately=drop_immediately,
            ),
            save=True)
    if drop_immediately:
        # Taping that interaction makes `git annex addurl` fail for reasons
        # not yet understood, so we crawl without vcr for now.
        # TODO: figure out why
        with chpwd(path):
            crawl()
    else:
        with externals_use_cassette(
                'test_simple_s3_test0_nonversioned_crawl_ext'
                + ('_immediately' if drop_immediately else '')), \
                chpwd(path):
            crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
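
Dropped content is still referenced by the annex and can be fetched again on demand. A sketch reusing the repo object from the test above, assuming the AnnexRepo.get API; the filename is hypothetical.

# Sketch: re-fetch one dropped file and confirm its content is back.
repo.get('some-file')                       # hypothetical filename
assert repo.file_has_content('some-file')
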
Example #8
def test_drop(path):
    get_test_providers('s3://datalad-test0-nonversioned')  # to verify having s3 credentials
    create(path)
    # unfortunately this doesn't work without force-dropping, since vcr
    # apparently stops and then gets queried again for the same tape while
    # testing the drop :-/
    with externals_use_cassette('test_simple_s3_test0_nonversioned_crawl_ext'), \
         chpwd(path):
        crawl_init(template="simple_s3",
                   args=dict(
                       bucket="datalad-test0-nonversioned",
                       drop=True,
                       drop_force=True  # so test goes faster
                   ),
                   save=True
                   )
        crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
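
Instead of globbing the worktree, the same check can be phrased over the annexed files the repository itself reports. A sketch reusing the repo object from the test above, assuming the AnnexRepo.get_annexed_files API:

# Sketch: assert that no annexed file has its content present locally.
assert not any(repo.file_has_content(f) for f in repo.get_annexed_files())
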