Code example #1
File: pigeon.py Project: Johnetordoff/osf-pigeon
def create_subcollection(collection_id, metadata=None, parent_collection=None):
    """
    The expected sub-collection hierarchy is as follows top-level OSF collection -> provider
    collection -> collection for nodes with multiple children -> all only child nodes

    :param metadata: dict should attributes for the provider's sub-collection is being created
    :param parent_collection: str the name of the  sub-collection's parent
    :return:
    """
    if metadata is None:
        metadata = {}

    session = internetarchive.get_session(
        config={
            "s3": {"access": settings.IA_ACCESS_KEY, "secret": settings.IA_SECRET_KEY},
        },
    )

    collection = internetarchive.Item(session, collection_id)
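    # A placeholder file is uploaded so the collection item gets created on IA;
    # an item only comes into existence once at least one file lands in it.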
    collection.upload(
        files={"dummy.txt": BytesIO(b"dummy")},
        metadata={
            "mediatype": "collection",
            "collection": parent_collection,
            **metadata,
        },
    )
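A minimal usage sketch (hypothetical identifiers; assumes `settings` carries valid IA keys, as in the excerpt above):

# Hypothetical identifiers for illustration only.
create_subcollection(
    "osf-provider-abc",
    metadata={"title": "Provider ABC"},
    parent_collection="osf",
)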
Code example #2
File: pedidos.py Project: okfn-brasil/esiclivre
def upload_attachment_to_internet_archive(pedido_protocol, filename):

    download_dir = flask.current_app.config['DOWNLOADS_PATH']
    downloaded_attachments = os.listdir(download_dir)

    if filename not in [a.decode('utf8') for a in downloaded_attachments]:
        logger.info("File {!r} does not exist!".format(filename))
        # TODO: What to do if the file is not available?
        # There is already a case where the download does not complete,
        # due to a failure on the esic server.
        return None
    else:

        # try:
        #     # get mediatype from file extension
        #     mediatype = filename.rpartition('.')[2]
        # except:
        #     mediatype = None

        item = internetarchive.Item('{prefix}_pedido_{protocol}'.format(
            prefix=flask.current_app.config['ATTACHMENT_URL_PREFIX'],
            protocol=pedido_protocol))
        metadata = dict(
            # mediatype=mediatype,
            # creator='OKF',
            created_at=arrow.now().isoformat())
        result = item.upload('{}/{}'.format(download_dir, filename),
                             metadata=metadata)

        if not result or result[0].status_code != 200:
            # TODO: What to do in this situation?
            logger.error("Error performing upload.")
        else:
            os.remove('{}/{}'.format(download_dir, filename))
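A hedged usage sketch: the function reads `DOWNLOADS_PATH` from the Flask config, so it must run inside an application context (`app`, the protocol number, and the filename below are illustrative):

# Illustrative only: requires a configured Flask app object.
with app.app_context():
    upload_attachment_to_internet_archive('12345', 'attachment_1.pdf')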
Code example #3
File: mirrorer.py Project: Olympic1/NetKAN-Infra
def try_mirror(self, ckan: CkanMirror) -> bool:
    if not ckan.can_mirror:
        # If we can't mirror, then we're done with this message
        logging.info('Ckan %s cannot be mirrored', ckan.mirror_item())
        return True
    if ckan.mirrored(self.ia_session):
        # If it's already mirrored, then we're done with this message
        logging.info('Ckan %s is already mirrored', ckan.mirror_item())
        return True
    download_file = ckan.open_download()
    if download_file:
        logging.info('Uploading %s', ckan.mirror_item())
        item = internetarchive.Item(self.ia_session, ckan.mirror_item())
        item.upload_file(download_file.name, ckan.mirror_filename(),
                         ckan.item_metadata, ckan.download_headers)
        source_url = ckan.source_download(self._default_branch(ckan))
        if source_url:
            with tempfile.NamedTemporaryFile() as tmp:
                logging.info('Attempting to archive source from %s',
                             source_url)
                download_stream_to_file(source_url, tmp)
                tmp.flush()
                item.upload_file(tmp.name, ckan.mirror_source_filename(),
                                 ckan.item_metadata,
                                 ckan.source_download_headers(tmp.name))
        return True
    logging.error("Failed to find or download %s", ckan.download)
    return False
Code example #4
def mk_mirror(target):
    '''Make the mirror'''
    target = 'collection:' + target
    print("Attempting to download collection: " + target)
    search = ia.Search(target)

    ## The internetarchive Search object is an iterator, not a list,
    ## so build our own list to get a total count up front.
    current_item = 1
    total_item = 0
    collection = []
    for entry in search:
        collection.append(entry)
        total_item += 1

    ## Go through all items of the collection and download each one
    for entry in collection:
        item_id = entry['identifier']
        print('Downloading %d/%d\t%s' % (current_item, total_item, item_id))

        item = ia.Item(item_id)
        item.download()
        print('\t\t Download complete')
        current_item += 1
Code example #5
def get_new_item():
    """Return an ia item object for an item that does not yet exist."""
    now = datetime.datetime.utcnow()
    item_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S')
    item = ia.Item(item_name)
    if not item.exists:
        return item

    raise KeyError('Generated item name already exists: ' + item_name)
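The original error message mentioned retrying; a hedged sketch of that variant, using the same old-style ia.Item API as the excerpt plus the standard time module:

import time

def get_new_item_with_retries(tries=5):
    # Sketch only: regenerate a timestamped name until an unused one is found.
    for _ in range(tries):
        now = datetime.datetime.utcnow()
        item = ia.Item('test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S'))
        if not item.exists:
            return item
        time.sleep(1)  # let the clock advance so the next name differs
    raise KeyError('Could not find a unique item name after %d tries' % tries)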
Code example #6
File: test_item.py Project: wumpus/internetarchive
def test_download():
    item = internetarchive.Item('nasa')
    item_dir = item.identifier
    assert not os.path.exists(item_dir)
    item.download()
    assert os.path.exists(item_dir)
    assert os.path.exists(os.path.join(item_dir,
                                       item.identifier + '_meta.xml'))
    shutil.rmtree(item_dir)
Code example #7
def download_item_files(item_id):
  # XXX Add a repeatable --format flag for this rather than hard coding
  # XXX Alternatively: yaml config file
  f = ["Comic Book RAR", "EPUB", "Animated GIF", "Text PDF", "Image Container PDF"]
  i = internetarchive.Item(item_id)
  verboseout("Downloading files from " + i.identifier)
  if args.verbose:
    i.download(concurrent=True, verbose=True, ignore_existing=True, formats=f)
  else:
    i.download(concurrent=True, ignore_existing=True, formats=f)
Code example #8
File: test_item.py Project: wumpus/internetarchive
def test_file():
    item = internetarchive.Item('nasa')
    filename = 'nasa_meta.xml'
    file = item.get_file(filename)

    assert not os.path.exists(filename)
    file.download()

    assert os.stat(filename).st_size == file.size
    os.unlink(filename)
Code example #9
def test_file():
    item = internetarchive.Item('stairs')
    filename = 'glogo.png'
    file = item.file(filename)

    assert not os.path.exists(filename)
    file.download()

    assert os.stat(filename).st_size == file.size
    os.unlink(filename)
Code example #10
File: ia_metadata.py Project: mikemccabe/ia-wrapper
def main(argv):
    args = docopt(__doc__, argv=argv)
    item = internetarchive.Item(args['<identifier>'])

    # Check existence of item.
    if args['--exists']:
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        else:
            stderr.write('{0} does not exist\n'.format(item.identifier))
            exit(1)

    # Modify metadata.
    elif args['--modify']:
        metadata = get_args_dict(args['--modify'])
        response = modify_metadata(args['<identifier>'], metadata)
        status_code = response['status_code']
        if not response['content']['success']:
            error_msg = response['content']['error']
            stderr.write('error: {0} ({1})\n'.format(error_msg, status_code))
            exit(1)
        stdout.write('success: {0}\n'.format(response['content']['log']))

    # Get metadata.
    elif args['--files']:
        for f in item.files():
            files_md = [
                f.item.identifier, f.name, f.source, f.format, f.size, f.md5
            ]
            stdout.write('\t'.join([str(x) for x in files_md]) + '\n')
    elif args['--formats']:
        formats = set([f.format for f in item.files()])
        stdout.write('\n'.join(formats) + '\n')
    elif args['--target']:
        metadata = []
        for key in args['--target']:
            if '/' in key:
                for i, k in enumerate(key.split('/')):
                    if i == 0:
                        md = item.metadata.get(k)
                    else:
                        if md:
                            md = md.get(k)
            else:
                md = item.metadata.get(key)
            if md:
                metadata.append(md)
        stdout.write('\t'.join([str(x) for x in metadata]) + '\n')
    else:
        metadata = dumps(item.metadata)
        stdout.write(metadata + '\n')
    exit(0)
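Since `main()` hands `argv` straight to docopt, the CLI can also be driven in-process; a small sketch ('nasa' is a public IA item, and the flag mirrors the first branch above):

# Prints "nasa exists" and exits with status 0 if the item is reachable.
main(['nasa', '--exists'])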
Code example #11
def get_file(item_name, file_name):
    """Get a file from a newly-created item. Wait for the file to land in the item, retrying if needed."""

    for i in range(5):
        print('  waiting 30 seconds for upload of', file_name)
        time.sleep(30)

        item = ia.Item(item_name)
        f = item.file(file_name)
        if f is not None:
            return f

    raise KeyError('Could not retrieve file after 5 tries')
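Together with `get_new_item()` from example #5, this supports a write-then-poll test pattern; a hedged sketch (the local filename is illustrative, `item.upload` as in the other excerpts):

item = get_new_item()
item.upload('local_file.txt')                    # illustrative upload
f = get_file(item.identifier, 'local_file.txt')  # poll until it lands
assert f.size > 0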
Code example #12
def export_to_ia(self, sha_value, **kwargs):
    """
    Called after a `Document` is signed and the hashes are calculated. Takes the
    file in `Document.doc_file` and uploads it to the Internet Archive with the
    sha256 value as the filename.
    """
    item = internetarchive.Item(settings.IA_ITEM)
    md = dict(creator=settings.IA_CREATOR)
    key = sha_value + os.path.splitext(self.doc_file.name)[1]
    item.upload_file(self.doc_file,
                     key=key,
                     metadata=md,
                     access_key=settings.IA_ACCESS_KEY,
                     secret_key=settings.IA_SECRET_KEY)
Code example #13
File: archiver.py Project: emijrp/Balchivist
    def __init__(self, identifier='', retries=3, retrysleep=30):
        """
        This module is used for providing regular functions used for
        uploading files into the Internet Archive. It is an extension of
        the internetarchive python library, but with better error handling.

        - identifier (string): The identifier for the item.
        - retries (int): The number of times to retry a request to the server.
        - retrysleep (int): Time (in seconds) to sleep before the next request.
        """
        self.IAItem = internetarchive.Item(identifier, max_retries=retries)
        self.retries = retries

        # Files that are present by default in all Internet Archive items
        self.defaultFiles = [
            '%s_archive.torrent' % (identifier),
            '%s_files.xml' % (identifier),
            '%s_meta.sqlite' % (identifier),
            '%s_meta.xml' % (identifier)
        ]
Code example #14
def iter_csv(csv_file):
    # NOTE: the head of this generator was missing from the excerpt; the
    # import-free setup below is reconstructed from how `headers` and `row`
    # are used further down (requires `import csv` at the top of the file).
    # `compile_metadata` is a project helper defined elsewhere in the file.
    with open(csv_file) as fh:
        reader = csv.reader(fh, delimiter='\t')
        headers = next(reader)
        for row in reader:
            if not row:
                continue
            dirty_metadata = dict((k, v) for k, v in zip(headers, row))
            metadata = compile_metadata(dirty_metadata)
            if len(metadata.keys()) <= 1:
                continue
            else:
                yield metadata


# main()
#_________________________________________________________________________________________
if __name__ == '__main__':
    tab_file = sys.argv[-1]
    errors = []
    for md in iter_csv(tab_file):
        item = internetarchive.Item(md['identifier'])
        r = item.modify_metadata(md)
        if r['status_code'] != 200:
            message = '{0}\tERROR! {1}\n'.format(md['identifier'],
                                                 r['content'])
            sys.stderr.write(message)
            errors.append(r)
        else:
            message = '{0}\thttps:{1}\n'.format(md['identifier'],
                                                r['content']['log'])
            sys.stdout.write(message)
    if errors == []:
        sys.exit(0)
    else:
        sys.exit(1)
Code example #15
os.environ['AWS_ACCESS_KEY_ID'] = args.accesskey
os.environ['AWS_SECRET_ACCESS_KEY'] = args.secretkey

print('Reading', str(args.sqlitefile))
con = sqlite3.connect(args.sqlitefile)
con.row_factory = sqlite3.Row

cur = con.cursor()
cur.execute('SELECT * FROM output')

for row in cur:
    if not os.path.isfile(row['path']):
        print(row['parent_item_id'], '|', '000', '|',
              row['ia_identifier'], '|', row['path'])
    else:
        item = internetarchive.Item(row['ia_identifier'])
        result = item.upload(
            row['path'],
            metadata=dict(
                collection=args.collection,
                mediatype='audio',
                language='yid',
                title=row['item_title'],
                description=row['item_description'],
                author=row['item_author'],
                #				title_yivo = row['title_yivo'],
                #				author_last = row['author_last'],
                #				author_first = row['author_first'],
                #				reader_last = row['reader_last'],
                #				reader_first = row['reader_first'],
                #				author_last_eng = row['author_last_eng'],
Code example #16
File: test_item.py Project: wumpus/internetarchive
def test_item():
    item = internetarchive.Item('nasa')
    assert item.metadata['identifier'] == 'nasa'
Code example #17
def test_export_to_ia(self):
    doc = Document.objects.get(id=1)
    item = ia.Item(settings.IA_ITEM)
    fname = doc.sha256 + os.path.splitext(doc.doc_file.name)[1]
    i = item.get_file(fname)
    self.assertNotEqual(i, None, "The file was uploaded to the Internet Archive")
Code example #18
def test_item():
    item = internetarchive.Item('stairs')
    assert item.metadata['metadata']['identifier'] == 'stairs'
Code example #19
## If we have valid JSON, extract some metadata
if data:
    metadata = {}
    metadata["title"] = data["fulltitle"].encode('ascii', 'ignore').decode('ascii')
    metadata["description"] = data["description"].encode('ascii', 'ignore').decode('ascii').replace("\n", "<br>")
    metadata["mediatype"] = "movies"
    metadata["collection"] = "opensource_movies"
    metadata["subject"] = myTags
    print("JSON parse successful! Checking identifier...")
    ## Check to see if our identifier is in use
    item = ia.get_item(sanitized)
    if not item.exists:
        ## Identifier not in use, let's upload
        print("Identifier cleared for use!")
        print("[uploading]")
        item = ia.Item(sanitized)
        response = item.upload(file, metadata=metadata, access_key=access_key, secret_key=secret_key)
        print("Server Response: " + str(response))
        ## Check the response. An HTTP 200 is OK
        if "200" in str(response):
            print("Success, adding other items associated with " + sanitized)
            for otherFile in os.listdir(workingDirectory):
                if otherFile.startswith(commonFile) and not otherFile.endswith(".info.json"):
                    print("Adding file: " + str(otherFile))
                    response = item.upload(otherFile, access_key=access_key, secret_key=secret_key)
                    print("Server Response: " + str(response))
                    if "200" in str(response):
                        print("Done adding file: " + str(otherFile) + " to item " + str(sanitized))
                    else:
                        print(bcolors.FAIL + "[ERROR] Server responded with: " + str(response) + bcolors.ENDC + ". Skipping to next file for item")
            print("Success! Item populating at: " + bcolors.OKGREEN + "https://archive.org/details/" + sanitized + bcolors.ENDC)
Code example #20
## !! will probably crash after 10 or so items !! feel free to edit the script to make it better for bigger collections
## See http://programminghistorian.org/lessons/data-mining-the-internet-archive for more detailed info

import internetarchive as ia
import time

coll = ia.Search('collection:xxxxxxxx') # fill this in -- searches for the ID of a collection in IA
## example of a collection page: https://archive.org/details/johnjaycollegeofcriminaljustice
## the collection ID for that page is johnjaycollegeofcriminaljustice
## you can tell a page is a collection if it has a 'Spotlight Item' on the left

num = 0

for result in coll.results(): # for all items in a collection
    num = num + 1 # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)

    item = ia.Item(itemid)
    item.download() # download all associated files (large!)
    print('\t\t Download success.')

    print('Pausing for 40 minutes')
    time.sleep(2400) # IA restricts the number of things you can download. Be nice to
                     # their servers and limit how much you download, too. For me, even
                     # this time restriction is not polite enough, and my connection
                     # still gets cut off all the time.
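One way to make the loop sturdier for large collections, as the header comment invites (a sketch; same ia API, with an assumed shorter per-item delay and basic error handling):

num = 0
for result in coll.results():
    num += 1
    itemid = result['identifier']
    try:
        ia.Item(itemid).download()
        print('Downloaded #' + str(num) + '\t' + itemid)
    except Exception as exc:  # IA may drop connections on long runs
        print('Failed #' + str(num) + '\t' + itemid + ': ' + str(exc))
    time.sleep(60)  # assumed politeness delay; tune to your connection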