Example #1
def test_internet_archive():
    from datetime import timedelta
    from django.conf import settings
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(
        internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date)
    )

    all_results = dict()

    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    session = internetarchive.get_session(config=c)

    for link in links:
        # Start a fresh results dict for each link so results from one GUID don't leak into the next.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = session.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid,
                truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url,
                link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization,
                    link.organization.registrar,
                )

        except Exception as e:
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    print(all_results)
Example #2
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open("ia_test.ini", "w") as fh:
        fh.write(test_conf)
    s = get_session(config_file="ia_test.ini")
    assert s.access_key == "key2"
Example #3
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        s = get_session(config={"s3": {"access": "key"}})
        files = get_files("nasa", files="nasa_meta.xml", archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", config={"logging": {"level": "INFO"}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        test_conf = """[s3]\naccess = key2"""
        with open("ia_test.ini", "w") as fh:
            fh.write(test_conf)
        files = get_files("nasa", files="nasa_meta.xml", config_file="ia_test.ini")
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", http_adapter_kwargs={"max_retries": 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", request_kwargs={"timeout": 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"
Example #4
def get_ia_session():
    try:
        assert config.has_key('s3/access_key')
        assert config.has_key('s3/secret_key')
        assert config.has_key('cookie')
        assert config.has_key('email')
    except Exception:
        return None

    ia_session = get_session(
        config={
            'general': {
                'secure': True
            },
            's3': {
                'access': config.get('s3/access_key'),
                'secret': config.get('s3/secret_key')
            },
            'cookies': {
                'logged-in-user': config.get('email'),
                'logged-in-sig': config.get('cookie')
            },
        },
        http_adapter_kwargs={'max_retries': 10},
    )
    return ia_session
Example #5
    def getStats():
        today = date.today()
        back30_days = (datetime.now() - timedelta(days=30)).date()
        back7_days = (datetime.now() - timedelta(days=7)).date()

        collection = 'greekgovernmentgazette'
        query_all = f'collection:({collection})'
        res = {}

        try:
            s = get_session()
            s.mount_http_adapter()

            search_results = s.search_items(query_all,
                                            fields=['identifier', 'addeddate'])
            lst_res = list(search_results)
            docs_last30days = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) >= back30_days
            ]
            docs_last7days = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) >= back7_days
            ]
            docs_today = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) == today
            ]

            res['count_all'] = len(lst_res)
            res['count_last30days'] = len(docs_last30days)
            res['count_last7days'] = len(docs_last7days)
            res['count_today'] = len(docs_today)
        finally:
            # Returning from finally also swallows any exception raised above,
            # so callers get whatever stats were collected (possibly an empty dict).
            return res
Example #6
    def __init__(self,
                 archive_id,
                 metadata=None,
                 config_file_path=None,
                 repo_base=None):
        """
        
        :param archive_id: 
        :param config_file_path:
        :param repo_base: In archive item, place each file in a folder mirroring its local location.
        """
        self.repo_base = repo_base
        self.archive_id = archive_id
        self.archive_session = internetarchive.get_session(
            config_file=config_file_path)
        self.archive_item = internetarchive.get_item(
            archive_id, config_file=config_file_path)
        self.metadata = metadata
        logging.info(self.archive_item.identifier)

        self.original_item_files = [
            f for f in self.archive_item.files
            if f["source"] == "original"
            and not f["name"].startswith(self.archive_item.identifier)
            and not f["name"].startswith("_")
        ]
        self.original_item_file_names = sorted(
            f["name"] for f in self.original_item_files)
Example #7
def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    The expected sub-collection hierarchy is as follows top-level OSF collection -> provider
    collection -> collection for nodes with multiple children -> all only child nodes

    :param metadata: dict should attributes for the provider's sub-collection is being created
    :param parent_collection: str the name of the  sub-collection's parent
    :return:
    """
    if metadata is None:
        metadata = {}

    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )

    collection = internetarchive.Item(session, collection_id)
    collection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **metadata,
        },
    )
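The docstring above describes the intended hierarchy; a minimal usage sketch under assumed identifiers (the collection ids and metadata below are illustrative, not OSF's real values):

# Illustrative only: create a provider-level collection under a hypothetical
# top-level OSF collection, then a node-level collection beneath the provider.
create_subcollection(
    "osf-provider-example",
    metadata={"title": "Example Provider"},
    parent_collection="osf-collections",
)
create_subcollection(
    "osf-node-abc123",
    metadata={"title": "Example Node"},
    parent_collection="osf-provider-example",
)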
Example #8
def get_ia_item(guid):
    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    return session.get_item(guid)
Example #9
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
Example #10
def stream_from_pbox(itemname, filename):
    # TODO:  handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
Example #11
def stream_from_pbox(itemname, filename):
    # TODO:  handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
Example #12
def new_session():
    global SESSION

    if SESSION is not None:
        raise Exception('have another session!')

    SESSION = get_session(config=CONFIG)
Example #13
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
Example #14
def test_internet_archive():
    from datetime import timedelta
    from django.conf import settings
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date   = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date))

    all_results = dict()

    c = {"s3":{"access":settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret":settings.INTERNET_ARCHIVE_SECRET_KEY}}
    session = internetarchive.get_session(config=c)

    for link in links:
        # Start a fresh results dict for each link so results from one GUID don't leak into the next.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = session.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == 'test_collection':
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"]=="web"
            guid_results["description"] = item.metadata["description"]=="Perma.cc archive of %s created on %s." % (link.submitted_url, link.creation_timestamp,)
            guid_results["contributor"] = item.metadata["contributor"]=="Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"]==link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"]=="http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"]=="urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (link.organization, link.organization.registrar)

        except Exception as e:
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    print(all_results)
Example #15
 def createSession(self):
     iaKey = decryptEnvVar('IA_ACCESS_KEY')
     iaSecret = decryptEnvVar('IA_SECRET_KEY')
     return get_session(
         config={'s3': {
             'access': iaKey,
             'secret': iaSecret
         }})
Example #16
 def start_ia_session(self):
     """ starts an internet archive session """
     config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                           secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
     s = get_session(config=config, debug=True)
     s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
     s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
     return s
Example #17
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
Example #18
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
Example #19
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={"general": {"secure": True}})
        rsps.add_metadata_mock("nasa")
        item = s.get_item("nasa")
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == "https://s3.us.archive.org/nasa/nasa.json"
Example #20
 def start_ia_session(self):
     """ starts an internet archive session """
     config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                           secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
     s = get_session(config=config,
                     debug=True)
     s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
     s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
     return s
Example #21
def upload_to_ia(force=set()):
    s = get_session()
    item = s.get_item("NotoFonts")
    hashdict = {f["name"]: f["md5"] for f in item.files}

    fonts_modified = False
    for path in tqdm(sorted(pathset)):
        filename = path.name
        file = open(path, "rb").read()
        hash = md5(file).hexdigest()
        if "fonts" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
        fonts_modified = True
        print("WORKING: " + filename)
        upload_paths = []
        ttf = TTFont(path)
        print("  CONVERTING TO woff2...")
        ttf.flavor = "woff2"
        woff2_path = "upload/" + path.with_suffix(".woff2").name
        try:
            ttf.save(open(woff2_path, "wb"))
            upload_paths.append(woff2_path)
        except TTLibError:
            print("could not convert to woff2")
        print("  CONVERTING TO woff...")
        ttf.flavor = "woff"
        woff_path = "upload/" + path.with_suffix(".woff").name
        ttf.save(open(woff_path, "wb"))
        upload_paths.append(woff_path)
        print("  UPLOADING...")
        r = item.upload(files=[*upload_paths, str(path)], retries=100)
        for upath in [woff2_path, woff_path]:
            remove(upath)
    if "css" in force or fonts_modified:
        from generate_css import build_all_css

        print("  GENERATING CSS...")
        build_all_css()
        css_files = glob("*.css")
        for path in [Path(p) for p in sorted(css_files)]:
            filename = path.name
            file = open(path, "rb").read()
            hash = md5(file).hexdigest()
            # if "css" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
            print("  UPLOADING " + filename)
            r = item.upload(files=css_files, retries=100)
Example #22
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
Example #23
def test_get_session_with_config():
    s = get_session(config={
        's3': {
            'access': 'key'
        },
        'general': {
            'secure': False
        }
    })
    assert s.access_key == 'key'
Example #24
 def __init__(
     self,
     issue_db: IssueDB,
     sandcrawler_db_client: SandcrawlerPostgrestClient,
     sandcrawler_s3_client: SandcrawlerMinioClient,
 ):
     self.issue_db: IssueDB = issue_db
     self.ia_client = internetarchive.get_session()
     self.sandcrawler_db_client = sandcrawler_db_client
     self.sandcrawler_s3_client = sandcrawler_s3_client
Example #25
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        c = {'s3': {'access': 'foo', 'secret': 'bar'}, 'general': {'secure': True}}
        s = get_session(config=c)
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
Example #26
def get_stats(days=30):
    back_days = (datetime.now() - timedelta(days=days)).date()
    query_all = 'collection:(greekgovernmentgazette) AND date:[{1} TO {0}]' \
                    .format(datetime.now().strftime('%Y-%m-%d'), back_days.strftime('%Y-%m-%d'))
    s = get_session()
    s.mount_http_adapter()

    search_results = s.search_items(query_all,
                                    fields=['identifier', 'addeddate'])

    return '{}\t{}'.format(query_all, len(search_results))
Example #27
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET, 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
        with responses.RequestsMock(
                assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE, status=200)
            r = item.upload(json_filename)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
Example #28
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET,
                 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
        with responses.RequestsMock(
                assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE, status=200)
            r = item.upload(json_filename)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
Example #29
    def __init__(self, file_storage, access_key, secret_key, loglevel, logfile):
        self.file_storage = file_storage
        self.access_key   = access_key
        self.secret_key   = secret_key

        session_data = {'access': access_key, 'secret': secret_key}
        if logfile:
            logconfig = {'level': loglevel, 'file': logfile}
        else:
            logconfig = {'level': loglevel}

        self.session = get_session({'s3': session_data, 'logging': logconfig})
        self.logger = logging.getLogger('iasync')
Example #30
    def __init__(self, top_dir, access_key, secret_key, loglevel, logfile):
        self.top_dir = top_dir
        self.access_key = access_key
        self.secret_key = secret_key

        session_data = {'access': access_key, 'secret': secret_key}
        if logfile:
            logconfig = {'level': loglevel, 'file': logfile}
        else:
            logconfig = {'level': loglevel}

        self.session = get_session({'s3': session_data, 'logging': logconfig})
        self.logger = logging.getLogger('gvision.ia')
Example #31
File: ia.py Project: uvalib/emma-ia
def ia_get_session() -> ArchiveSession:
    """
    Get an IA session based on the configuration supplied via environment
    variables.

    Because get_session() starts with the values found in ~/.ia for the
    current user and then merges in the supplied "additional" values, the
    configuration file reference is explicitly eliminated here. For desktop
    testing, this guarantees that the application has the same dependence on
    environment variables as it would when deployed.

    """
    return internetarchive.get_session(IA_CONFIG, config_file='/dev/null')
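IA_CONFIG is defined elsewhere in that project; a minimal sketch of how such a dict might be assembled from environment variables (the variable names here are assumptions, not the project's actual ones):

import os
import internetarchive

# Hypothetical environment-variable names, used only for illustration.
IA_CONFIG = {
    "s3": {
        "access": os.environ.get("IA_ACCESS_KEY"),
        "secret": os.environ.get("IA_SECRET_KEY"),
    },
    "general": {"secure": True},
}

# config_file='/dev/null' keeps ~/.ia out of the merge, so only the values above apply.
session = internetarchive.get_session(IA_CONFIG, config_file="/dev/null")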
Example #32
def main(collection: str, name: str, concurrency: int, dummy: bool = False,
         dummy_text: str = None):
    if dummy:
        t = DummyTrainer(dummy_text)
    else:
        urls = from_cdx_url(cdx_url(collection), session=internetarchive.get_session())
        t = Trainer(urls, concurrency=concurrency)
    identifier = str(int(time.time()))
    filename_base = collection + '_dictionary_' + identifier
    upload_urls = t.upload(filename_base + '.zstdict.zst', filename_base)
    print(upload_urls)
    add_entry(identifier, name, t.sha256, upload_urls['public_url'],
              upload_urls['backup_url'])
Example #33
 def __init__(self,
              ckm_repo: CkanMetaRepo,
              ia_access: str,
              ia_secret: str,
              ia_collection: str,
              token: str = None) -> None:
     self.ckm_repo = ckm_repo
     self.ia_collection = ia_collection
     self.ia_access = ia_access
     self.ia_session = internetarchive.get_session(
         config={'s3': {
             'access': ia_access,
             'secret': ia_secret,
         }})
     self._gh = github.Github(token) if token else github.Github()
Example #34
def get_dictionary(filename: str) -> zstandard.ZstdCompressionDict:
    s = internetarchive.get_session()
    # The dictionary is stored in a zstd skippable frame at the start of the file;
    # fetch its 8-byte header (magic number + frame size) first.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=0-7'})
    if r.content[:4] != b'\x5D\x2A\x4D\x18':
        # No skippable-frame magic number, so there is no embedded dictionary.
        return None
    data_size = struct.unpack('<L', r.content[4:])[0]
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=8-{}'.format(8 + data_size - 1)})
    dictionary = r.content
    if r.content[:4] == b'\x28\xB5\x2F\xFD':
        # The payload is itself a zstd frame; decompress it to get the raw dictionary.
        dictionary = zstandard.ZstdDecompressor().decompress(dictionary)
    if dictionary[:4] != b'\x37\xA4\x30\xEC':
        # 0xEC30A437 is the zstd dictionary magic number.
        raise ValueError('Not a dictionary.')
    return zstandard.ZstdCompressionDict(dictionary)
Example #35
def main():
    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name = sys.argv[1]
        release_id = sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
    else:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            assert len(fields) == 2
            item_name = fields[0]
            release_id = fields[1]
            item_to_fileset(item_name, release_id=release_id, session=session)
Example #36
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {
                              'level': 'INFO'
                          }})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #37
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {
                              'level': 'INFO'
                          }})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #38
def cdx_url(collection: str) -> str:
    session = internetarchive.get_session()
    response = session.get(
        'https://archive.org/advancedsearch.php',
        params={
            'q': (
                'collection:archiveteam* '
                'AND format:(Item CDX Index) '
                'AND identifier:{}*'.format(collection)
            ),
            'fl[]': 'identifier',
            'sort[]': 'addeddate desc',
            'rows': '1',
            'output': 'json',
            'scope': 'all'
        }
    ).json()
    print(response)
    identifier = response['response']['docs'][0]['identifier']
    return 'https://archive.org/download/{0}/{0}.cdx.gz'.format(identifier)
Example #39
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #40
    def __init__(self, **kwargs):
        super().__init__()
        self.ingest_strategy = IngestStrategy.ArchiveorgFileset

        # TODO: enable cleanup when confident (eg, safe path parsing)
        self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files",
                                                   True)
        self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR",
                                          "/tmp/sandcrawler/")
        try:
            os.mkdir(self.working_dir)
        except FileExistsError:
            pass

        self.http_session = requests_retry_session()
        self.ia_session = internetarchive.get_session(
            config={
                "s3": {
                    "access": os.environ.get("IA_ACCESS_KEY"),
                    "secret": os.environ.get("IA_SECRET_KEY"),
                },
            })
Example #41
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #42
def archive_url(data: typing.Dict[str, bytes],
                url_data: typing.Tuple[str, int, int, str, bool]):
    url, offset, length, filename, redownload = url_data
    if redownload:
        data[url] = requests.get(url, allow_redirects=False).content
    else:
        if filename.endswith('.zst'):
            with dictionary_lock:
                dictionary = get_dictionary(filename)
        r = internetarchive.get_session().get(
            'https://archive.org/download/' + filename,
            headers={
                'Range': 'bytes={}-{}'.format(offset, offset + length - 1)
            })
        if filename.endswith('.zst'):
            data[url] = zstandard.ZstdDecompressor(dict_data=dictionary) \
                .decompressobj().decompress(r.content)
        elif filename.endswith('.gz'):
            data[url] = gzip.decompress(r.content)
        elif filename.endswith('.warc'):
            data[url] = r.content
        else:
            raise ValueError('WARC type not supported.')
    print(len(data[url]), url)
Example #43
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)


access_key = settings.IA_ACCESS_KEY
secret_key = settings.IA_SECRET_KEY
session = ia.get_session({'s3': {
    'access': access_key,
    'secret': secret_key,
}})


def upload_to_ia(identifier, files, metadata=None):
    """Upload an item and its files to the Internet Archive

    On the Internet Archive there are Items and files. Items have a global
    identifier, and files go inside the item:

        https://internetarchive.readthedocs.io/en/latest/items.html

    This function mirrors the IA library's similar upload function, but builds
    in retries and various assumptions that make sense. Note that according to
    emails with IA staff, it is best to maximize the number of files uploaded to
    an Item at a time, rather than uploading each file in a separate go.
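The body of upload_to_ia is cut off above; a minimal sketch of the behaviour the docstring describes (uploading all of an item's files in one call and letting the library retry), reusing the module-level session shown earlier, with an assumed retry count:

def upload_all_files(identifier, files, metadata=None):
    # Sketch only: touch the IA item once by uploading every file in a single
    # call, and rely on the library's built-in retries for transient S3 errors.
    item = session.get_item(identifier)
    responses = item.upload(files=files, metadata=metadata or {}, retries=10)
    return all(r.ok for r in responses)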
Example #44
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}, 'general': {'secure': False}})
    assert s.access_key == 'key'
Example #45
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}})
    assert s.access_key == 'key'
Example #46
def test_get_session_with_config():
    s = get_session(config={"s3": {"access": "key"}})
    assert s.access_key == "key"
Example #47
import os
import sys

inc_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, inc_path)
from copy import deepcopy

import responses

from internetarchive.cli import ia_list
from internetarchive import get_session


protocol = 'https:'


ROOT_DIR = os.getcwd()
TEST_JSON_FILE = os.path.join(ROOT_DIR, 'tests/data/nasa_meta.json')
SESSION = get_session()
with open(TEST_JSON_FILE, 'r') as fh:
    ITEM_METADATA = fh.read().strip()

NASA_FILES = {
    'NASAarchiveLogo.jpg',
    'globe_west_540.jpg',
    'nasa_reviews.xml',
    'nasa_meta.xml',
    'nasa_archive.torrent',
    'nasa_files.xml',
}


def test_ia_list(capsys):
    with responses.RequestsMock() as rsps:
Example #48
def nasa_item():
    session = get_session()
    with IaRequestsMock() as mocker:
        mocker.add_metadata_mock('nasa')
        yield session.get_item('nasa')
Example #49
def session():
    return get_session(config=dict(s3=dict(access='access', secret='secret')))
Example #50
def session_with_logging():
    return get_session(config={'logging': {'level': 'INFO'}})
Example #51
def session():
    return get_session()
Example #52
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        s = get_session(config={"s3": {"access": "key3"}})
        item = get_item("nasa", archive_session=s)
        assert item.session.access_key == "key3"
Example #53
def test_get_item_with_archive_session(nasa_mocker):
    s = get_session(config={'s3': {'access': 'key3'}})
    item = get_item('nasa', archive_session=s)
    assert item.session.access_key == 'key3'