Example #1
def test_internet_archive():
    from datetime import timedelta
    from django.conf import settings
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(
        internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date)
    )

    all_results = dict()

    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    session = internetarchive.get_session(config=c)

    for link in links:
        # Start a fresh results dict for each link so results from one GUID don't leak into the next.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = session.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid,
                truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url,
                link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization,
                    link.organization.registrar,
                )

        except Exception as e:
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    print(all_results)
Example #2
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open("ia_test.ini", "w") as fh:
        fh.write(test_conf)
    s = get_session(config_file="ia_test.ini")
    assert s.access_key == "key2"
Example #3
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        s = get_session(config={"s3": {"access": "key"}})
        files = get_files("nasa", files="nasa_meta.xml", archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", config={"logging": {"level": "INFO"}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        test_conf = """[s3]\naccess = key2"""
        with open("ia_test.ini", "w") as fh:
            fh.write(test_conf)
        files = get_files("nasa", files="nasa_meta.xml", config_file="ia_test.ini")
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", http_adapter_kwargs={"max_retries": 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"

        files = get_files("nasa", files="nasa_meta.xml", request_kwargs={"timeout": 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == "nasa_meta.xml"
Example #4
def get_ia_session():
    try:
        assert config.has_key('s3/access_key')
        assert config.has_key('s3/secret_key')
        assert config.has_key('cookie')
        assert config.has_key('email')
    except Exception:
        return None

    ia_session = get_session(
        config={
            'general': {
                'secure': True
            },
            's3': {
                'access': config.get('s3/access_key'),
                'secret': config.get('s3/secret_key')
            },
            'cookies': {
                'logged-in-user': config.get('email'),
                'logged-in-sig': config.get('cookie')
            },
        },
        http_adapter_kwargs={'max_retries': 10},
    )
    return ia_session
Example #5
    def getStats():
        today = date.today()
        back30_days = (datetime.now() - timedelta(days=30)).date()
        back7_days = (datetime.now() - timedelta(days=7)).date()

        collection = 'greekgovernmentgazette'
        query_all = f'collection:({collection})'
        res = {}

        try:
            s = get_session()
            s.mount_http_adapter()

            search_results = s.search_items(query_all,
                                            fields=['identifier', 'addeddate'])
            lst_res = list(search_results)
            docs_last30days = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) >= back30_days
            ]
            docs_last7days = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) >= back7_days
            ]
            docs_today = [
                i for i in lst_res
                if isodate.parse_date(i['addeddate']) == today
            ]

            res['count_all'] = len(lst_res)
            res['count_last30days'] = len(docs_last30days)
            res['count_last7days'] = len(docs_last7days)
            res['count_today'] = len(docs_today)
        finally:
            # Returning from finally also swallows any exception raised above,
            # so callers get whatever stats were collected (possibly an empty dict).
            return res
Example #6
    def __init__(self,
                 archive_id,
                 metadata=None,
                 config_file_path=None,
                 repo_base=None):
        """
        
        :param archive_id: 
        :param config_file_path:
        :param repo_base: In archive item, place each file in a folder mirroring its local location.
        """
        self.repo_base = repo_base
        self.archive_id = archive_id
        self.archive_session = internetarchive.get_session(
            config_file=config_file_path)
        self.archive_item = internetarchive.get_item(
            archive_id, config_file=config_file_path)
        self.metadata = metadata
        logging.info(self.archive_item.identifier)

        self.original_item_files = [
            f for f in self.archive_item.files
            if f["source"] == "original"
            and not f["name"].startswith(self.archive_item.identifier)
            and not f["name"].startswith("_")
        ]
        self.original_item_file_names = sorted(
            f["name"] for f in self.original_item_files)
Example #7
def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    The expected sub-collection hierarchy is as follows top-level OSF collection -> provider
    collection -> collection for nodes with multiple children -> all only child nodes

    :param metadata: dict should attributes for the provider's sub-collection is being created
    :param parent_collection: str the name of the  sub-collection's parent
    :return:
    """
    if metadata is None:
        metadata = {}

    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )

    collection = internetarchive.Item(session, collection_id)
    collection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **metadata,
        },
    )
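The docstring above describes the intended hierarchy; a minimal usage sketch under assumed identifiers (the collection ids and metadata below are illustrative, not OSF's real values):

# Illustrative only: create a provider-level collection under a hypothetical
# top-level OSF collection, then a node-level collection beneath the provider.
create_subcollection(
    "osf-provider-example",
    metadata={"title": "Example Provider"},
    parent_collection="osf-collections",
)
create_subcollection(
    "osf-node-abc123",
    metadata={"title": "Example Node"},
    parent_collection="osf-provider-example",
)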
Example #8
def get_ia_item(guid):
    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )
    return session.get_item(guid)
Example #9
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
Example #10
def stream_from_pbox(itemname, filename):
    # TODO:  handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
Example #11
def stream_from_pbox(itemname, filename):
    # TODO:  handle errors etc
    archive_session = get_session(config_file=settings.IATOOL_CONFIG_PATH)
    item = archive_session.get_item(itemname)
    files = item.get_files(filename)
    file = next(files)
    return file.download(return_responses=True)
Example #12
def new_session():
    global SESSION

    if SESSION is not None:
        raise Exception('have another session!')

    SESSION = get_session(config=CONFIG)
Example #13
def test_get_session_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)
    s = get_session(config_file='ia_test.ini')
    assert s.access_key == 'key2'
Example #14
def test_internet_archive():
    from datetime import timedelta
    from django.conf import settings
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date   = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date))

    all_results = dict()

    c = {"s3":{"access":settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret":settings.INTERNET_ARCHIVE_SECRET_KEY}}
    session = internetarchive.get_session(config=c)

    for link in links:
        # Start a fresh results dict for each link so results from one GUID don't leak into the next.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = session.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz", archive_session=session)]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == 'test_collection':
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (link.guid, truncatechars(link.submitted_title, 50))
            guid_results["mediatype"] = item.metadata["mediatype"]=="web"
            guid_results["description"] = item.metadata["description"]=="Perma.cc archive of %s created on %s." % (link.submitted_url, link.creation_timestamp,)
            guid_results["contributor"] = item.metadata["contributor"]=="Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"]==link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"]=="http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"]=="urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (link.organization, link.organization.registrar)

        except Exception as e:
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    print(all_results)
Example #15
 def createSession(self):
     iaKey = decryptEnvVar('IA_ACCESS_KEY')
     iaSecret = decryptEnvVar('IA_SECRET_KEY')
     return get_session(
         config={'s3': {
             'access': iaKey,
             'secret': iaSecret
         }})
Example #16
 def start_ia_session(self):
     """ starts an internet archive session """
     config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                           secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
     s = get_session(config=config, debug=True)
     s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
     s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
     return s
Example #17
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
Example #18
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key3'}})
        item = get_item('nasa', archive_session=s)
        assert item.session.access_key == 'key3'
Example #19
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={"general": {"secure": True}})
        rsps.add_metadata_mock("nasa")
        item = s.get_item("nasa")
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == "https://s3.us.archive.org/nasa/nasa.json"
Example #20
 def start_ia_session(self):
     """ starts an internet archive session """
     config = dict(s3=dict(access=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                           secret=settings.INTERNET_ARCHIVE_SECRET_KEY))
     s = get_session(config=config,
                     debug=True)
     s.access_key = settings.INTERNET_ARCHIVE_ACCESS_KEY
     s.secret_key = settings.INTERNET_ARCHIVE_SECRET_KEY
     return s
Example #21
def upload_to_ia(force=set()):
    s = get_session()
    item = s.get_item("NotoFonts")
    hashdict = {f["name"]: f["md5"] for f in item.files}

    fonts_modified = False
    for path in tqdm(sorted(pathset)):
        filename = path.name
        file = open(path, "rb").read()
        hash = md5(file).hexdigest()
        if "fonts" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
        fonts_modified = True
        print("WORKING: " + filename)
        upload_paths = []
        ttf = TTFont(path)
        print("  CONVERTING TO woff2...")
        ttf.flavor = "woff2"
        woff2_path = "upload/" + path.with_suffix(".woff2").name
        try:
            ttf.save(open(woff2_path, "wb"))
            upload_paths.append(woff2_path)
        except TTLibError:
            print("could not convert to woff2")
        print("  CONVERTING TO woff...")
        ttf.flavor = "woff"
        woff_path = "upload/" + path.with_suffix(".woff").name
        ttf.save(open(woff_path, "wb"))
        upload_paths.append(woff_path)
        print("  UPLOADING...")
        r = item.upload(files=[*upload_paths, str(path)], retries=100)
        for upath in [woff2_path, woff_path]:
            remove(upath)
    if "css" in force or fonts_modified:
        from generate_css import build_all_css

        print("  GENERATING CSS...")
        build_all_css()
        css_files = glob("*.css")
        for path in [Path(p) for p in sorted(css_files)]:
            filename = path.name
            file = open(path, "rb").read()
            hash = md5(file).hexdigest()
            # if "css" not in force:
            try:
                if hashdict[filename] == hash:
                    print("SKIPPING: " + filename)
                    continue
            except KeyError:
                pass
            print("  UPLOADING " + filename)
            r = item.upload(files=css_files, retries=100)
Example #22
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
Example #23
def test_get_session_with_config():
    s = get_session(config={
        's3': {
            'access': 'key'
        },
        'general': {
            'secure': False
        }
    })
    assert s.access_key == 'key'
Example #24
 def __init__(
     self,
     issue_db: IssueDB,
     sandcrawler_db_client: SandcrawlerPostgrestClient,
     sandcrawler_s3_client: SandcrawlerMinioClient,
 ):
     self.issue_db: IssueDB = issue_db
     self.ia_client = internetarchive.get_session()
     self.sandcrawler_db_client = sandcrawler_db_client
     self.sandcrawler_s3_client = sandcrawler_s3_client
Example #25
def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        c = {'s3': {'access': 'foo', 'secret': 'bar'}, 'general': {'secure': True}}
        s = get_session(config=c)
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
        with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE)
            r = item.upload(NASA_METADATA_PATH)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'
Example #26
def get_stats(days=30):
    back_days = (datetime.now() - timedelta(days=days)).date()
    query_all = 'collection:(greekgovernmentgazette) AND date:[{1} TO {0}]' \
                    .format(datetime.now().strftime('%Y-%m-%d'), back_days.strftime('%Y-%m-%d'))
    s = get_session()
    s.mount_http_adapter()

    search_results = s.search_items(query_all,
                                    fields=['identifier', 'addeddate'])

    return '{}\t{}'.format(query_all, len(search_results))
Example #27
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET, 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
        with responses.RequestsMock(
                assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE, status=200)
            r = item.upload(json_filename)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
Example #28
def test_upload_secure_session(testitem_metadata, json_filename):
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        s = get_session(config={'general': {'secure': True}})
        rsps.add(responses.GET,
                 'https://archive.org/metadata/nasa',
                 body=testitem_metadata,
                 status=200)
        item = s.get_item('nasa')
        with responses.RequestsMock(
                assert_all_requests_are_fired=False) as rsps:
            rsps.add(responses.PUT, S3_URL_RE, status=200)
            r = item.upload(json_filename)
            assert r[0].url == 'https://s3.us.archive.org/nasa/nasa_meta.json'
Example #29
    def __init__(self, file_storage, access_key, secret_key, loglevel, logfile):
        self.file_storage = file_storage
        self.access_key   = access_key
        self.secret_key   = secret_key

        session_data = {'access': access_key, 'secret': secret_key}
        if logfile:
            logconfig = {'level': loglevel, 'file': logfile}
        else:
            logconfig = {'level': loglevel}

        self.session = get_session({'s3': session_data, 'logging': logconfig})
        self.logger = logging.getLogger('iasync')
Example #30
    def __init__(self, top_dir, access_key, secret_key, loglevel, logfile):
        self.top_dir = top_dir
        self.access_key = access_key
        self.secret_key = secret_key

        session_data = {'access': access_key, 'secret': secret_key}
        if logfile:
            logconfig = {'level': loglevel, 'file': logfile}
        else:
            logconfig = {'level': loglevel}

        self.session = get_session({'s3': session_data, 'logging': logconfig})
        self.logger = logging.getLogger('gvision.ia')
Example #31
File: ia.py Project: uvalib/emma-ia
def ia_get_session() -> ArchiveSession:
    """
    Get an IA session based on the configuration supplied via environment
    variables.

    Because get_session() starts with the values found in ~/.ia for the
    current user and then merges in the supplied "additional" values, the
    configuration file reference is explicitly eliminated here. For desktop
    testing, this guarantees that the application has the same dependence on
    environment variables as it would when deployed.

    """
    return internetarchive.get_session(IA_CONFIG, config_file='/dev/null')
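IA_CONFIG is defined elsewhere in that project; a minimal sketch of how such a dict might be assembled from environment variables (the variable names here are assumptions, not the project's actual ones):

import os
import internetarchive

# Hypothetical environment-variable names, used only for illustration.
IA_CONFIG = {
    "s3": {
        "access": os.environ.get("IA_ACCESS_KEY"),
        "secret": os.environ.get("IA_SECRET_KEY"),
    },
    "general": {"secure": True},
}

# config_file='/dev/null' keeps ~/.ia out of the merge, so only the values above apply.
session = internetarchive.get_session(IA_CONFIG, config_file="/dev/null")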
Example #32
def main(collection: str, name: str, concurrency: int, dummy: bool = False,
         dummy_text: str = None):
    if dummy:
        t = DummyTrainer(dummy_text)
    else:
        urls = from_cdx_url(cdx_url(collection), session=internetarchive.get_session())
        t = Trainer(urls, concurrency=concurrency)
    identifier = str(int(time.time()))
    filename_base = collection + '_dictionary_' + identifier
    upload_urls = t.upload(filename_base + '.zstdict.zst', filename_base)
    print(upload_urls)
    add_entry(identifier, name, t.sha256, upload_urls['public_url'],
              upload_urls['backup_url'])
Example #33
 def __init__(self,
              ckm_repo: CkanMetaRepo,
              ia_access: str,
              ia_secret: str,
              ia_collection: str,
              token: str = None) -> None:
     self.ckm_repo = ckm_repo
     self.ia_collection = ia_collection
     self.ia_access = ia_access
     self.ia_session = internetarchive.get_session(
         config={'s3': {
             'access': ia_access,
             'secret': ia_secret,
         }})
     self._gh = github.Github(token) if token else github.Github()
Example #34
def get_dictionary(filename: str) -> zstandard.ZstdCompressionDict:
    s = internetarchive.get_session()
    # The dictionary is stored in a zstd skippable frame at the start of the file;
    # fetch its 8-byte header (magic number + frame size) first.
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=0-7'})
    if r.content[:4] != b'\x5D\x2A\x4D\x18':
        # No skippable-frame magic number, so there is no embedded dictionary.
        return None
    data_size = struct.unpack('<L', r.content[4:])[0]
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=8-{}'.format(8 + data_size - 1)})
    dictionary = r.content
    if r.content[:4] == b'\x28\xB5\x2F\xFD':
        # The payload is itself a zstd frame; decompress it to get the raw dictionary.
        dictionary = zstandard.ZstdDecompressor().decompress(dictionary)
    if dictionary[:4] != b'\x37\xA4\x30\xEC':
        # 0xEC30A437 is the zstd dictionary magic number.
        raise ValueError('Not a dictionary.')
    return zstandard.ZstdCompressionDict(dictionary)
Example #35
def main():
    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name = sys.argv[1]
        release_id = sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
    else:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            assert len(fields) == 2
            item_name = fields[0]
            release_id = fields[1]
            item_to_fileset(item_name, release_id=release_id, session=session)
Example #36
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {
                              'level': 'INFO'
                          }})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #37
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {
                              'level': 'INFO'
                          }})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #38
def cdx_url(collection: str) -> str:
    session = internetarchive.get_session()
    response = session.get(
        'https://archive.org/advancedsearch.php',
        params={
            'q': (
                'collection:archiveteam* '
                'AND format:(Item CDX Index) '
                'AND identifier:{}*'.format(collection)
            ),
            'fl[]': 'identifier',
            'sort[]': 'addeddate desc',
            'rows': '1',
            'output': 'json',
            'scope': 'all'
        }
    ).json()
    print(response)
    identifier = response['response']['docs'][0]['identifier']
    return 'https://archive.org/download/{0}/{0}.cdx.gz'.format(identifier)
Example #39
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #40
    def __init__(self, **kwargs):
        super().__init__()
        self.ingest_strategy = IngestStrategy.ArchiveorgFileset

        # TODO: enable cleanup when confident (eg, safe path parsing)
        self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files",
                                                   True)
        self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR",
                                          "/tmp/sandcrawler/")
        try:
            os.mkdir(self.working_dir)
        except FileExistsError:
            pass

        self.http_session = requests_retry_session()
        self.ia_session = internetarchive.get_session(
            config={
                "s3": {
                    "access": os.environ.get("IA_ACCESS_KEY"),
                    "secret": os.environ.get("IA_SECRET_KEY"),
                },
            })
Example #41
def test_get_files_with_get_item_kwargs(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        s = get_session(config={'s3': {'access': 'key'}})
        files = get_files('nasa', files='nasa_meta.xml', archive_session=s)
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          config={'logging': {'level': 'INFO'}})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        test_conf = """[s3]\naccess = key2"""
        with open('ia_test.ini', 'w') as fh:
            fh.write(test_conf)
        files = get_files('nasa', files='nasa_meta.xml',
                          config_file='ia_test.ini')
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa',
                          files='nasa_meta.xml',
                          http_adapter_kwargs={'max_retries': 3})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'

        files = get_files('nasa', files='nasa_meta.xml',
                          request_kwargs={'timeout': 4})
        files = list(files)
        assert len(files) == 1
        assert files[0].name == 'nasa_meta.xml'
Example #42
def archive_url(data: typing.Dict[str, bytes],
                url_data: typing.Tuple[str, int, int, str, bool]):
    url, offset, length, filename, redownload = url_data
    if redownload:
        data[url] = requests.get(url, allow_redirects=False).content
    else:
        if filename.endswith('.zst'):
            with dictionary_lock:
                dictionary = get_dictionary(filename)
        r = internetarchive.get_session().get(
            'https://archive.org/download/' + filename,
            headers={
                'Range': 'bytes={}-{}'.format(offset, offset + length - 1)
            })
        if filename.endswith('.zst'):
            data[url] = zstandard.ZstdDecompressor(dict_data=dictionary) \
                .decompressobj().decompress(r.content)
        elif filename.endswith('.gz'):
            data[url] = gzip.decompress(r.content)
        elif filename.endswith('.warc'):
            data[url] = r.content
        else:
            raise ValueError('WARC type not supported.')
    print(len(data[url]), url)
Example #43
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)


access_key = settings.IA_ACCESS_KEY
secret_key = settings.IA_SECRET_KEY
session = ia.get_session({'s3': {
    'access': access_key,
    'secret': secret_key,
}})


def upload_to_ia(identifier, files, metadata=None):
    """Upload an item and its files to the Internet Archive

    On the Internet Archive there are Items and files. Items have a global
    identifier, and files go inside the item:

        https://internetarchive.readthedocs.io/en/latest/items.html

    This function mirrors the IA library's similar upload function, but builds
    in retries and various assumptions that make sense. Note that according to
    emails with IA staff, it is best to maximize the number of files uploaded to
    an Item at a time, rather than uploading each file in a separate go.
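The body of upload_to_ia is cut off above; a minimal sketch of the behaviour the docstring describes (uploading all of an item's files in one call and letting the library retry), reusing the module-level session shown earlier, with an assumed retry count:

def upload_all_files(identifier, files, metadata=None):
    # Sketch only: touch the IA item once by uploading every file in a single
    # call, and rely on the library's built-in retries for transient S3 errors.
    item = session.get_item(identifier)
    responses = item.upload(files=files, metadata=metadata or {}, retries=10)
    return all(r.ok for r in responses)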
Example #44
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}, 'general': {'secure': False}})
    assert s.access_key == 'key'
Example #45
def test_get_session_with_config():
    s = get_session(config={'s3': {'access': 'key'}})
    assert s.access_key == 'key'
Example #46
def test_get_session_with_config():
    s = get_session(config={"s3": {"access": "key"}})
    assert s.access_key == "key"
Example #47
import os
import sys

inc_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, inc_path)
from copy import deepcopy

import responses

from internetarchive.cli import ia_list
from internetarchive import get_session


protocol = 'https:'


ROOT_DIR = os.getcwd()
TEST_JSON_FILE = os.path.join(ROOT_DIR, 'tests/data/nasa_meta.json')
SESSION = get_session()
with open(TEST_JSON_FILE, 'r') as fh:
    ITEM_METADATA = fh.read().strip()

NASA_FILES = {
    'NASAarchiveLogo.jpg',
    'globe_west_540.jpg',
    'nasa_reviews.xml',
    'nasa_meta.xml',
    'nasa_archive.torrent',
    'nasa_files.xml',
}


def test_ia_list(capsys):
    with responses.RequestsMock() as rsps:
Example #48
def nasa_item():
    session = get_session()
    with IaRequestsMock() as mocker:
        mocker.add_metadata_mock('nasa')
        yield session.get_item('nasa')
Example #49
def session():
    return get_session(config=dict(s3=dict(access='access', secret='secret')))
Example #50
def session_with_logging():
    return get_session(config={'logging': {'level': 'INFO'}})
Example #51
def session():
    return get_session()
Example #52
def test_get_item_with_archive_session():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        s = get_session(config={"s3": {"access": "key3"}})
        item = get_item("nasa", archive_session=s)
        assert item.session.access_key == "key3"
Example #53
def test_get_item_with_archive_session(nasa_mocker):
    s = get_session(config={'s3': {'access': 'key3'}})
    item = get_item('nasa', archive_session=s)
    assert item.session.access_key == 'key3'