Example #1
0
def extract_data(records, feed_stem, collection_title):
    """Extract IA data from API using provided IDs, and compose results into list of records that can be used to generate OPDS. See https://archive.org/services/docs/api/internetarchive/

    Args:
        records (list): list of dicts of form: {'bibid': <bibid>, 'id':<ia_id>, 'label':<link_label>}
        feed_stem (str): abbreviated label of feed, e.g., 'ia_mrp_feed'
        collection_title (str): human-readable string, e.g.: "Missionary Research Pamphlets"

    Returns:
        dict: form of {'data': <the_output>, 'errors': <the_errors>}
    """

    the_output = []
    the_errors = []

    # check for duplicate ids and report them (they will be processed anyway)
    the_ids = [r['id'] for r in records]
    dupe_ids = find_duplicates(the_ids)
    dupe_errors = [[
        str(datetime.today()), feed_stem, r['bibid'], r['id'], 'Duplicate ID'
    ] for r in records if r['id'] in dupe_ids]
    the_errors += dupe_errors

    for record in records:

        # Fetch the item once and reuse it: the original called get_item()
        # twice per record (once for .files, once for .metadata), doubling
        # the number of API round-trips.
        item = get_item(record['id'])
        record_files = item.files
        record_metadata = item.metadata
        if not record_metadata:
            # There was no data from the API
            print('ERROR: No data for ' + record['bibid'] + ' : ' +
                  record['id'] + '! Skipping...')
            the_errors.append([
                str(datetime.today()), feed_stem, record['bibid'],
                record['id'], 'No data!'
            ])
            continue

        if all('.pdf' not in f['name'] for f in record_files):
            # There is data but no PDF derivative to use.
            print('ERROR: No PDF available for ' + record['bibid'] + ' : ' +
                  record['id'] + '! Skipping...')
            the_errors.append([
                str(datetime.today()), feed_stem, record['bibid'],
                record['id'], 'No PDF file!'
            ])
            continue

        # Add the metadata to the output
        print(record_metadata['identifier'] + ': ' + record_metadata['title'])
        # Add CUL-specific metadata for use in generating feed XML.
        record_metadata['cul_metadata'] = {
            'bibid': record['bibid'],
            'feed_id': feed_stem,
            'collection_name': collection_title,
            'label': record['label']
        }
        the_output.append(record_metadata)

    return {'data': the_output, 'errors': the_errors}
Example #2
0
def test_get_item_with_kwargs():
    # http_adapter_kwargs should be forwarded to the session's mounted HTTP
    # adapter: max_retries=13 ends up wrapped in a urllib3.Retry instance.
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        item = get_item("nasa", http_adapter_kwargs={"max_retries": 13})
        assert isinstance(item.session.adapters["{0}//".format(protocol)].max_retries, urllib3.Retry)

    # NOTE(review): if the tiny timeout does NOT raise, this block asserts
    # nothing and the test silently passes -- consider pytest.raises; confirm
    # whether best-effort behavior is intentional.
    try:
        item = get_item("nasa", request_kwargs={"timeout": 0.0000000000001})
    except Exception as exc:
        assert "Connection to archive.org timed out" in str(exc)
Example #3
0
def test_get_item_with_kwargs():
    """Check that http_adapter_kwargs reach the adapter mounted for PROTOCOL."""
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        item = get_item('nasa', http_adapter_kwargs={'max_retries': 13})
        adapter = item.session.adapters['{0}//'.format(PROTOCOL)]
        assert isinstance(adapter.max_retries, urllib3.Retry)

    # A sub-nanosecond timeout should fail; when it does, the message
    # mentions the timeout.
    try:
        get_item('nasa', request_kwargs={'timeout': .0000000000001})
    except Exception as exc:
        assert 'timed out' in str(exc)
Example #4
0
def test_get_item_with_kwargs():
    # http_adapter_kwargs are applied to the adapter mounted for the current
    # protocol; max_retries=13 is wrapped in a urllib3.Retry.
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        item = get_item('nasa', http_adapter_kwargs={'max_retries': 13})
        assert isinstance(item.session.adapters['{0}//'.format(PROTOCOL)].max_retries,
                          urllib3.Retry)

    # NOTE(review): if no exception is raised here the test passes without
    # asserting anything -- presumably intentional best-effort; confirm.
    try:
        get_item('nasa', request_kwargs={'timeout': .0000000000001})
    except Exception as exc:
        assert 'timed out' in str(exc)
Example #5
0
def get_valid_identifier_suffix(library, Id):
    """Find an unused archive.org identifier suffix for (library, Id).

    Probes 'bub_<library>_<Id>_1', then the bare 'bub_<library>_<Id>', then
    suffixes _2 through _9, returning the Id (plus suffix) of the first one
    that does not exist yet.  If all are taken, falls back to a random
    identifier.

    NOTE(review): the fallback path returns an Item object while every other
    path returns a string -- callers should be checked; preserved as-is.
    """
    item = get_item("%s_%s_%s_1" % ('bub', library, Id))
    if not item.exists:
        item = get_item("%s_%s_%s" % ('bub', library, Id))
        if not item.exists:
            return Id
    for index in range(2, 10):
        item = get_item("%s_%s_%s_%s" % ('bub', library, Id, index))
        if not item.exists:
            return Id + "_" + str(index)
    # bytes.encode("hex") is Python 2 only; bytes.hex() is the Python 3
    # equivalent and yields the same 32-character hex string.
    item = get_item(urandom(16).hex())
    return item
Example #6
0
def get_valid_identifier_suffix(library, Id):
    """Find an unused archive.org identifier suffix for (library, Id).

    Probes 'bub_<library>_<Id>_1', the bare id, then suffixes _2.._9, and
    returns the first unused Id(+suffix); otherwise falls back to a random
    identifier.  NOTE(review): the fallback returns an Item object while the
    other paths return strings -- confirm callers handle both.
    """
    item = get_item("%s_%s_%s_1" % ("bub", library, Id))
    if item.exists == False:
        item = get_item("%s_%s_%s" % ("bub", library, Id))
        if item.exists == False:
            return Id
    for index in range(2, 10):
        item = get_item("%s_%s_%s_%s" % ("bub", library, Id, index))
        if item.exists == False:
            return Id + "_" + str(index)
    # NOTE(review): bytes.encode("hex") works only on Python 2; on Python 3
    # this raises AttributeError (use bytes.hex()).
    item = get_item(urandom(16).encode("hex"))
    return item
Example #7
0
 def get_valid_identifier(self, primary = True):
     """Iterate over identifiers suffixed by _<no>, until found.

     Returns the first non-existing Item for 'bub_<library>_<Id>' (when
     primary is True) or for suffixes _2.._9, skipping the Item that is
     already self.ia_identifier; falls back to a random identifier.
     """
     item = ia.get_item("%s_%s_%s" %('bub', self.library, self.Id))
     if item.exists == False and primary == True:
         return item
     for index in range(2,10):
         item = ia.get_item("%s_%s_%s_%s" %('bub', self.library, self.Id, index))
         # Don't hand back the identifier this object already occupies.
         if item.identifier == self.ia_identifier:
             continue
         if item.exists == False:
             return item
     # NOTE(review): bytes.encode("hex") is Python 2 only; on Python 3 use
     # bytes.hex() instead.
     item = ia.get_item(urandom(16).encode("hex"))
     return item
Example #8
0
 def get_valid_identifier(self, primary = True):
     """Iterate over identifiers suffixed by _<no>, until found.

     Returns the first non-existing Item for 'bub_<library>_<Id>' (when
     primary is True) or for suffixes _2.._9, skipping the Item that is
     already self.ia_identifier; falls back to a random identifier.
     """
     item = ia.get_item("%s_%s_%s" %('bub', self.library, self.Id))
     if item.exists == False and primary == True:
         return item
     for index in range(2,10):
         item = ia.get_item("%s_%s_%s_%s" %('bub', self.library, self.Id, index))
         # Don't hand back the identifier this object already occupies.
         if item.identifier == self.ia_identifier:
             continue
         if item.exists == False:
             return item
     # NOTE(review): bytes.encode("hex") is Python 2 only; on Python 3 use
     # bytes.hex() instead.
     item = ia.get_item(urandom(16).encode("hex"))
     return item
Example #9
0
def test_get_item_with_kwargs():
    # http_adapter_kwargs should be forwarded to the session's mounted HTTP
    # adapter: max_retries=13 ends up wrapped in a urllib3.Retry instance.
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa', http_adapter_kwargs={'max_retries': 13})
        assert isinstance(item.session.adapters['{0}//'.format(protocol)].max_retries,
                          urllib3.Retry)

    # NOTE(review): if the tiny timeout does NOT raise, this block asserts
    # nothing and the test silently passes -- confirm intent.
    try:
        item = get_item('nasa', request_kwargs={'timeout': .0000000000001})
    except Exception as exc:
        assert 'Connection to archive.org timed out' in str(exc)
def check_for_new_items(username, password, collection, collections_db):
    """Return identifiers in *collection* that are not yet recorded.

    username->(String) IA username
    password->(String) IA password
    collection->(String) IA identifier for the collection to watch
    collections_db->(String) file path of a plain-text database of already
    downloaded identifiers, one per line, e.g.:

       ident_1
       ident_d
       ident_3

    returns->(list) list of identifiers of new items in collection
    """
    configure(username, password)  # Configure log in information for IA

    # Everything already listed in the database is considered "seen".
    with open(collections_db) as db:
        seen = [entry.rstrip("\n") for entry in db]

    return [
        book.identifier
        for book in get_item(collection).contents()
        if book.identifier not in seen
    ]
def main():
    """Tag urlteam/terroroftinytown items with per-shortener subjects.

    For every search hit whose subject is exactly 'urlteam;terroroftinytown',
    derive one subject per .zip file (the shortener id before the first dot)
    and write the expanded subject list back to the item.
    """
    search = internetarchive.search.Search(
        'urlteam terroroftinytown -collection:test_collection')

    for result in search:
        print(result)

        item = internetarchive.get_item(result['identifier'])

        # Skip items whose subjects were already expanded.
        if not item.metadata['subject'] == 'urlteam;terroroftinytown':
            continue

        subjects = ['urlteam', 'terroroftinytown', 'archiveteam']

        for file_obj in item.iter_files():
            if file_obj.name.endswith('.zip'):
                # e.g. 'bitly.zip' -> shortener id 'bitly'
                shortener_id = file_obj.name.split('.', 1)[0]
                subjects.append(shortener_id)

        new_subject = ';'.join(subjects)

        print(new_subject)

        item.modify_metadata({'subject': new_subject},
                             access_key=ACCESS_KEY,
                             secret_key=SECRET_KEY)
Example #12
0
def validate():
    """Check all synced files to make sure their md5 hash matches the hash
    stored on Archive.org"""
    # Fix: the docstring above originally sat mid-function as a bare string
    # expression (a no-op statement); moved here so it actually documents
    # the function.  No runtime behavior changes.
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site
    synced_session = ytarchive().sessionsGetSyncedOldest(site_id)

    if synced_session:
        session_log = ytarchive().logsGetSynced(synced_session.id)
        # Only validate sessions whose sync log entry is at least an hour old.
        validation_time = time.time() - (60 * 60)

        if session_log and session_log.time < validation_time:
            archive_info = get_item(synced_session.archive_id)
            session_files = ytarchive().filesGetSynced(synced_session.id)
            valid = validate_files(session_files, archive_info)

            if valid:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'validated': True
                })
                log(synced_session, "Session files validated",
                    c.SESSION_SYNCED)
                cleanup_files(synced_session)
            else:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'state': c.SESSION_FAILED,
                    'validated': False
                })
                log(synced_session, "Session files failed validation",
                    c.SESSION_FAILED, c.LOG_ERROR)
Example #13
0
File: sample.py Project: edsu/spn
def item_summary(item_id):
    """Return (date, size) for an IA item.

    date is the 'YYYY-MM-DD' encoded at the end of the item's identifier,
    size is the total size in bytes of the item's *arc.gz files.
    Returns (None, None) when the item cannot be fetched or has no metadata.
    """
    print("summarizing %s" % item_id)

    # IA's api can throw errors so try 10 times before failing
    item = None
    for _ in range(10):
        try:
            item = ia.get_item(item_id)
            break
        except Exception as e:
            print('caught exception: %s' % e)
            time.sleep(10)
    if item is None:
        # Fix: the original fell through with `item` unbound after 10 failed
        # tries, raising NameError below; give up cleanly instead.
        print('missing metadata %s' % item_id)
        return None, None

    # Fix: check for 'metadata' BEFORE using it -- the original guard ran
    # after the regex below had already indexed ['metadata'] (KeyError).
    if 'metadata' not in item.item_metadata:
        print('missing metadata %s' % item_id)
        return None, None

    size = 0
    for file in item.item_metadata.get('files', []):
        if file['name'].endswith('arc.gz'):
            size += int(file['size'])

    # Raw string avoids the invalid-escape DeprecationWarning for \d.
    m = re.match(r'^.+-(\d\d\d\d)(\d\d)(\d\d)',
                 item.item_metadata['metadata']['identifier'])
    date = '%s-%s-%s' % m.groups()

    return date, size
Example #14
0
def uploadTikTok(username, tiktok, deletionStatus, file):
    """Upload a downloaded TikTok directory to archive.org.

    Args:
        username: TikTok author handle (used in metadata and original URL).
        tiktok: numeric video id, which is also the local directory name.
        deletionStatus: when truthy, delete the local files after upload.
        file: optional open log file; the uploaded id is appended to it.
    """
    # The original tested five separate regexes ({17},{18},{19},{8},{9}
    # digits) with re.match, which anchors only at the START of the string;
    # any string beginning with at least 8 digits satisfied the disjunction,
    # so a single pattern is equivalent.
    if os.path.isdir(tiktok) and re.match(r'[0-9]{8}', str(tiktok)):
        item = get_item('tiktok-' + tiktok)
        item.upload('./' + tiktok + '/',
                    verbose=True,
                    checksum=True,
                    delete=deletionStatus,
                    metadata=dict(collection='opensource_media',
                                  subject='tiktok',
                                  creator=username,
                                  title='TikTok Video by ' + username,
                                  originalurl='https://www.tiktok.com/@' +
                                  username + '/video/' + tiktok,
                                  scanner='TikUp 2020.07.01'),
                    retries=9001,
                    retries_sleep=60)
        if deletionStatus:
            # rmdir only succeeds because upload(delete=...) already removed
            # the directory's contents.
            os.rmdir(tiktok)
        print()
        print('Uploaded to https://archive.org/details/tiktok-' + tiktok)
        print()
        if file is not None:
            file.write(str(tiktok))
            file.write('\n')
Example #15
0
 def sequence(self, book):
     """
     Run this sequencer's pipeline over an Archive.org book.

     :param  [str|ia.Item] book: an Archive.org book Item or Item.identifier
     :return: the completed Sequence (total_time set), or None when the item
         is missing or its DjvuXML/DjvuTXT cannot be fetched.
     """
     try:
         sequence_tic = time.perf_counter()
         sq = self.Sequence(copy.deepcopy(self.pipeline))
         # Accept either a ready-made Item or a bare identifier string.
         sq.book = book if type(book) is ia.Item else ia.get_item(book)
         if sq.book.exists:
             for p in sq.pipeline:
                 sq.pipeline[p].run(sq.book)
             sequence_toc = time.perf_counter()
             sq.total_time = round(sequence_toc - sequence_tic, 3)
             return sq
         else:
             print(sq.book.identifier + ' - Item cannot be found.')
             logging.error(sq.book.identifier + ' - Item cannot be found.')
     except IndexError:
         print(sq.book.identifier +
               ' - does not have DjvuXML and/or DjvuTXT to be sequenced!')
         logging.error(
             sq.book.identifier +
             ' - does not have DjvuXML and/or DjvuTXT to be sequenced!')
     except requests.exceptions.HTTPError:
         print(
             sq.book.identifier +
             ' - DjvuXML and/or DjvuTXT is forbidden and can\'t be sequenced!'
         )
         # Fix: this was written `identifier + - '...'`, applying unary minus
         # to a str -- a TypeError at log time instead of a log line.  Now
         # matches the print() message above.
         logging.error(
             sq.book.identifier +
             ' - DjvuXML and/or DjvuTXT is forbidden and can\'t be sequenced!'
         )
Example #16
0
 def upload_ia_item(self):
     """Upload this object's saved image and/or crop to its IA item.

     Returns the internetarchive Item after upload, or None when there is
     nothing to upload.  Local files are removed once uploaded.
     """
     logger.debug("Uploading IA item for {}".format(self.ia_id))
     if not self.has_image and not self.has_crop:
         logger.debug("No images to upload")
         return None
     files = []
     if self.has_image:
         saved_image = self.save_image()
         files.append(saved_image)
     if self.has_crop:
         saved_crop = self.save_crop()
         files.append(saved_crop)
     internetarchive.upload(
         self.ia_id,
         files,
         metadata=self.ia_metadata,
         access_key=settings.IA_ACCESS_KEY_ID,
         secret_key=settings.IA_SECRET_ACCESS_KEY,
         checksum=False,
         verbose=True
     )
     # The saved local copies are only needed for the upload itself.
     if self.has_image:
         os.remove(saved_image)
     if self.has_crop:
         os.remove(saved_crop)
     return internetarchive.get_item(self.ia_id)
def check_for_new_items(username,password,collection,collections_db):
    """username->(String) IA username
       password->(String) IA password
       collection->(String) IA identifier for the collection to watch
       collections_db->(String) file path of plain text database of collections
       to ignore, one identifier per line, e.g.:

       ident_1
       ident_d
       ident_3

       returns->(list) list of identifiers of new items in collection

       Checks if there is a new group of scans in the collection from a list in a text file"""

    configure(username,password) # Configure log in information for IA
    downloaded_collections = []
    with open(collections_db) as f:
        for line in f:
            downloaded_collections.append(line.rstrip("\n"))
    new_collections = []
    # Anything in the collection not already recorded in the db is "new".
    for book in get_item(collection).contents():
        if(book.identifier not in downloaded_collections):
            new_collections.append(book.identifier)
    return new_collections
Example #18
0
def upload_item(item_dir):
    """Upload the contents of *item_dir* to the IA item of the same name.

    The directory must contain <dir>/<dir>.pdf and <dir>/<dir>.json; the
    JSON file supplies the item metadata (and must include 'collection')
    and is excluded from the upload.  Touches an 'uploaded' marker file
    when every upload request returns 200.  Returns the responses.
    """
    all_files = ['{0}/{1}'.format(item_dir, x) for x in os.listdir(item_dir)]

    json_md = '{0}/{0}.json'.format(item_dir)
    pdf_file = '{0}/{0}.pdf'.format(item_dir)

    # Make sure the item has at the very least a PDF and metadata.
    for required in (pdf_file, json_md):
        assert any(f == required for f in all_files)

    # Parse metadata.
    with open(json_md) as fp:
        md = json.load(fp)
    assert 'collection' in md

    # The JSON file is metadata, not content -- don't upload it.
    files = [f for f in all_files if f != json_md]

    item = get_item(item_dir)
    rs = item.upload(files,
                     metadata=md,
                     retries=100,
                     delete=True,
                     checksum=True)
    if all(r.status_code == 200 for r in rs):
        # Touch the marker file recording a fully successful upload.
        with open('uploaded', 'a'):
            os.utime('uploaded', None)
    return rs
Example #19
0
def main():
    """Correct the titles of SDCompo round entries from a metadata CSV.

    NOTE(review): this is Python 2 code (print statement, str.decode) and
    will not run under Python 3 as-is.
    """
    parser = argparse.ArgumentParser(description='Correct the title of the entries of a given set of rounds.')
    parser.add_argument('rounds', metavar='ROUND', nargs='+',
                        help='Round to correct')
    parser.add_argument('--metadata-file', default='metadata_rnd_1_to_89.csv',
                        help='Path of the metadata file')

    args = parser.parse_args()

    # Load metadata
    reader = csv.DictReader(open(args.metadata_file))

    # Iterate over entries of those rounds
    for d in reader:
        if d['round'] in args.rounds:
            place = d['place']
            author = d['author']
            # Zero-pad to 3 digits, e.g. round '7' -> '007'.
            padded_round = "{0:03d}".format(int(d['round']))
            title = d['title']

            # Don't try to correct empty entries
            if author == 'SDCTester':
                continue

            root_target_file = 'SDC' + padded_round + '-' \
                + padded_place(place) + '_' + author + '_-_' \
                + title.replace(' ', '_')
            root_target_file_decoded = root_target_file.decode('utf8')
            target_file = 'files/' + root_target_file_decoded + ".flac"
            new_title = place + ' - ' + root_target_file_decoded
            md = {'title': new_title}
            item = get_item('SDCompo_Round_' + padded_round)
            print "Round {} {}:".format(d['round'], author), title, "->", new_title
            item.modify_metadata(md, target=target_file)
Example #20
0
def upload_to_internet_archive(self, link_guid):
    """Upload a Perma.cc capture's WARC to a dedicated IA item.

    NOTE(review): Python 2 code (print statements); will not run under
    Python 3 as-is.  On failure the task retries itself.
    """
    # setup
    asset = Asset.objects.get(link_id=link_guid)
    link = asset.link
    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX+link_guid
    warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)

    # create IA item for this capture
    item = internetarchive.get_item(identifier)
    metadata = {
        'collection':settings.INTERNET_ARCHIVE_COLLECTION,
        'mediatype':'web',
        'date':link.creation_timestamp,
        'title':'Perma Capture %s' % link_guid,
        'creator':'Perma.cc',

        # custom metadata
        'submitted_url':link.submitted_url,
        'perma_url':"http://%s/%s" % (settings.HOST, link_guid)
    }

    # upload
    # NOTE(review): debug=True normally makes internetarchive prepare the
    # request without sending it -- confirm this is intended in production.
    with default_storage.open(warc_path, 'rb') as warc_file:
        success = item.upload(warc_file,
                              metadata=metadata,
                              access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                              secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
                              verbose=True,
                              debug=True)
    if success:
        print "Succeeded."
    else:
        print "Failed."
        self.retry(exc=Exception("Internet Archive reported upload failure."))
Example #21
0
def test_get_item_with_config():
    """A config dict with an s3 access key should set session.access_key."""
    with responses.RequestsMock() as rsps:
        url = '{0}//archive.org/metadata/nasa'.format(protocol)
        rsps.add(responses.GET, url, body=ITEM_METADATA, status=200)
        item = get_item('nasa', config={'s3': {'access': 'key'}})
        assert item.session.access_key == 'key'
Example #22
0
def test_get_item():
    """get_item should return an Item whose identifier matches the request."""
    with responses.RequestsMock() as rsps:
        url = '{0}//archive.org/metadata/nasa'.format(protocol)
        rsps.add(responses.GET, url, body=ITEM_METADATA, status=200)
        item = get_item('nasa')
        assert item.identifier == 'nasa'
Example #23
0
def main(argv):
    """Delete files from an archive.org item (CLI 'delete' subcommand).

    With --all, every file in the item is deleted (cascading); otherwise
    only the named <file> arguments.  Files IA forbids deleting via S3 are
    silently skipped.  Exits 1 on the first failure.
    """
    args = docopt(__doc__, argv=argv)
    verbose = args['--verbose']
    item = get_item(args['<identifier>'])

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = list(item.iter_files())
        # Fix: this key was misspelled '--cacade', so --all never actually
        # enabled the cascade flag read as '--cascade' in the loop below.
        args['--cascade'] = True
    else:
        files = [item.get_file(f) for f in args['<file>']]

    for f in files:
        if not f:
            # NOTE(review): f is falsy here, so f.name may itself fail --
            # the requested filename is no longer available at this point.
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(f.name))
            sys.exit(1)
        if any(f.name.endswith(s) for s in no_delete):
            continue
        resp = f.delete(verbose=args['--verbose'], cascade_delete=args['--cascade'])
        if resp.status_code != 204:
            error = parseString(resp.content)
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(' error: {0} ({1})\n'.format(msg, resp.status_code))
            sys.exit(1)
Example #24
0
 def __init__(self,ident):
     """
     set up directory info - download from archive if necessary

     Downloads the first .ogv and _meta.xml file of the IA item when the
     local directory does not exist yet; sets self.failed when either the
     remote files or the local copies are missing.
     """
     self.ident = ident
     self.dir = os.path.join(Film.root_dir,ident)
     self.clip_dir = os.path.join(self.dir,"clips")
     self.failed = False
     if os.path.isdir(self.dir) == False:
         # No local copy yet -- fetch the video and its metadata from IA.
         item = get_item(ident)
         ogg = [x['name'] for x in item.files if ".ogv" in x['name']]
         meta = [x['name'] for x in item.files if "_meta.xml" in x['name']]
         if ogg and meta:
             # Only the first of each is needed.
             ogg = ogg[:1]
             meta = meta[:1]
             os.makedirs(self.dir)
             os.makedirs(self.clip_dir)
             download(ident,files=ogg+meta,destdir=Film.root_dir, verbose=True)
         else:
             self.failed = True

     if self.failed == False:
         # Locate the downloaded files on disk.
         self.ogv = [x for x in os.listdir(self.dir) if ".ogv" in x]
         self.meta = [x for x in os.listdir(self.dir) if "_meta.xml" in x]

         if self.ogv and self.meta:
             self.ogv = self.ogv[0]
             self.meta =self.meta[0]
             self.load_meta()
         else:
             self.failed = True
Example #25
0
def append_meta(identifier, add_subject):
    """Append *add_subject* to an IA item's 'subject' metadata field.

    Subjects are kept as a semicolon-separated string; a list-valued
    subject is first flattened to 'a;b;c;' form.  Prints the outcome of
    the modify_metadata call.
    """
    # obtain existing metadata for given item
    item = get_item(identifier)
    subject = item.metadata['subject']

    # if subjects are given as a list, convert to semicolon-separated list
    # (join replaces the original quadratic `l += ...` accumulation loop
    # and the ambiguous single-letter name `l`; output is identical).
    if isinstance(subject, list):
        subject = ''.join('%s;' % element for element in subject)

    # append new subject to existing subject
    if str(subject).endswith(';'):
        new_subject = str(subject) + add_subject
    else:
        new_subject = str(subject) + ';' + add_subject

    # upload new metadata
    r = item.modify_metadata(dict(subject=new_subject))

    # check if metadata successfully modified
    if (r.status_code == 200):
        print(":: [Identifier] Item: [%s] %s" %
              (identifier, item.metadata['title']))
        print("Subjects '%s' successfully appended." % add_subject)
        print("Result: %s" % new_subject)
    else:
        print("Failed to add new subjects.")
def mk_mirror(target):
    '''Make the mirror'''
    session = ArchiveSession()
    target = 'collection:' + target
    print("Attempting to download collection: " + target)
    search = ia.Search(session, target)

    ## Because the internetarchive module won't return us a list
    ## we'll have to make our own.
    collection = [entry for entry in search]
    total_item = len(collection)

    ## Go through all items of the collection and download
    for current_item, entry in enumerate(collection, start=1):
        item_id = entry['identifier']
        print('Downloading ' + str(current_item) + '/' + str(total_item) +
              '\t' + item_id)

        item = get_item(item_id)
        status = item.download()
        print('\t\t Download successful')
Example #27
0
    def getFileList(self):
        """
        This function is used to get the list of files in an item and excludes
        the default files that are present in all Internet Archive items.

        Returns: List of files in the item excluding default files in
        alphabetical order. False if an error has occurred.
        """
        tries = 0
        iaitem = None
        while tries < self.retries:
            try:
                iaitem = internetarchive.get_item(identifier=self.identifier)
                break
            except Exception as exception:
                self.handleException(exception=exception)
                tries += 1
                # Bug fix: the original tested `tries == self.retries`
                # BEFORE incrementing, inside a loop guarded by
                # `tries < self.retries`, so the guard never fired and
                # exhausting the retries left `iaitem` unbound (NameError)
                # instead of returning False.
                if tries >= self.retries:
                    return False
                time.sleep(60 * tries)
        if iaitem is None:
            # Defensive: loop exited without a successful fetch.
            return False
        filelist = []
        for thefile in iaitem.files:
            filename = thefile['name']
            # Skip IA's standard per-item administrative files.
            if filename in self.defaultFiles:
                continue
            filelist.append(filename)
        return sorted(filelist)
Example #28
0
def test_get_item():
    # get_item should return an Item whose identifier matches the request.
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa')
        assert item.identifier == 'nasa'
Example #29
0
def download_item(identifier):
    '''
    Download the mp3 file associated with identifier from the catalog.
    
    Inputs:
        identifier: str, identifier for an Item object
    Returns:
    '''
    item = ia.get_item(identifier)

    # Locate the first .mp3 file attached to this item.
    f_name = ''
    for candidate in item.iter_files():
        if candidate.name[-4:] == '.mp3':
            f_name = candidate.name
            break

    assert f_name != '', 'No .mp3 file associated with item {}.\
        Try a different item'.format(identifier)

    f = item.get_file(f_name)

    # Only fetch files under the size cap; otherwise report and bail out.
    if f.size <= MAX_SIZE:
        f.download(SOUND_DIR + f_name)
        return f.name
    print('File size is', f.size, 'bytes')
    print('File size exceeds', MAX_SIZE, 'bytes')
    return None
Example #30
0
def test_get_item_with_config():
    # A config dict with an s3 access key should set session.access_key.
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa', config={'s3': {'access': 'key'}})
        assert item.session.access_key == 'key'
    def __init__(self,
                 archive_id,
                 metadata=None,
                 config_file_path=None,
                 repo_base=None):
        """Bind to an existing archive.org item and index its original files.

        :param archive_id: identifier of the archive.org item.
        :param metadata: optional metadata to associate with the item.
        :param config_file_path: internetarchive config file used for both
            the session and the item lookup.
        :param repo_base: In archive item, place each file in a folder
            mirroring its local location.
        """
        self.repo_base = repo_base
        self.archive_id = archive_id
        self.archive_session = internetarchive.get_session(
            config_file=config_file_path)
        self.archive_item = internetarchive.get_item(
            archive_id, config_file=config_file_path)
        self.metadata = metadata
        logging.info(self.archive_item.identifier)

        # Keep only files uploaded as "original" content, skipping IA's
        # derived/administrative files (names starting with the identifier
        # or with an underscore).
        self.original_item_files = list(
            filter(
                lambda x: x["source"] == "original" and not x["name"].
                startswith(self.archive_item.identifier) and not x[
                    "name"].startswith("_"), self.archive_item.files))
        self.original_item_file_names = sorted(
            map(lambda x: x["name"], self.original_item_files))
Example #32
0
def _upload_files(args, identifier, local_file, upload_kwargs):
    """Upload *local_file* to the item *identifier*.

    With --debug, dump each prepared request's endpoint and headers;
    otherwise report S3 errors and exit 1 on the first failure.
    """
    # --quiet is an explicit False when output is wanted.
    verbose = args['--quiet'] is False
    config = {'logging': {'level': 'INFO'}} if args['--log'] else {}
    item = get_item(identifier, config=config)
    if verbose:
        sys.stdout.write('{0}:\n'.format(item.identifier))

    response = item.upload(local_file, **upload_kwargs)
    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            header_lines = [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format('\n'.join(header_lines)))
    else:
        for resp in response:
            if not resp or resp.status_code == 200:
                continue
            # Surface the S3 error details from the XML response body.
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)
def upload_single_show_to_internetarchive(show_info: RefinedShow) -> str:
    """Upload (or update) one Hooting Yard show on archive.org.

    If an item for the show already exists its metadata is refreshed;
    otherwise the audio file is uploaded with the metadata.  Returns the
    item identifier used.
    """
    show_title = f"Hooting Yard On The Air: {show_info.title()}"
    upload_id = f"{IA_PRFX}_{show_info.id}"
    log.info(f"Attempting to upload {show_info.id}, Title: {show_title}")

    show_text = show_info.get_title_and_text()
    show_toc = show_info.get_toc()

    md = {
        "collection": "hooting-yard",
        "description": show_toc,
        "mediatype": "audio",
        "title": show_title,
        "creator": "Frank Key",
        "date": show_info.tx_date().isoformat(),
        "notes": show_text,
    }

    log.info(f"Metadata: {pprint.pformat(md)}")

    # Existing item -> just refresh metadata; missing item -> full upload.
    try:
        item: Item = get_item(upload_id)
        log.info(f"Found an item: {item}")
        item.modify_metadata(metadata=md)
    except internetarchive.exceptions.ItemLocateError:
        r = upload(
            identifier=upload_id,
            files=[show_info.get_audio_file().path],
            metadata=md,
            verbose=True,
        )
        assert r[0].status_code == 200
        log.info(f"Completed upload {show_info.id}")

    return upload_id
def main():
    """Tag urlteam/terroroftinytown items with per-shortener subjects.

    For every search hit whose subject is exactly 'urlteam;terroroftinytown',
    derive one subject per .zip file (the shortener id before the first dot)
    and write the expanded subject list back to the item.
    """
    search = internetarchive.search.Search('urlteam terroroftinytown -collection:test_collection')

    for result in search:
        print(result)

        item = internetarchive.get_item(result['identifier'])

        # Skip items whose subjects were already expanded.
        if not item.metadata['subject'] == 'urlteam;terroroftinytown':
            continue

        subjects = ['urlteam', 'terroroftinytown', 'archiveteam']

        for file_obj in item.iter_files():
            if file_obj.name.endswith('.zip'):
                # e.g. 'bitly.zip' -> shortener id 'bitly'
                shortener_id = file_obj.name.split('.', 1)[0]
                subjects.append(shortener_id)

        new_subject = ';'.join(subjects)

        print(new_subject)

        item.modify_metadata(
            {'subject': new_subject},
            access_key=ACCESS_KEY, secret_key=SECRET_KEY
        )
Example #35
0
def main(argv):
    """CLI entry point: check item existence, modify metadata, or print it."""
    args = docopt(__doc__, argv=argv)
    item = get_item(args['<identifier>'])

    # Existence check: exit 0 when present, 1 otherwise.
    if args['--exists']:
        if not item.exists:
            sys.stderr.write('{0} does not exist\n'.format(item.identifier))
            sys.exit(1)
        sys.stdout.write('{0} exists\n'.format(item.identifier))
        sys.exit(0)

    # Metadata modification; --append reuses the same path with append on.
    if args['--modify'] or args['--append']:
        append = bool(args['--append'])
        metadata_args = args['--modify'] or args['--append']
        metadata = get_args_dict(metadata_args)
        response = modify_metadata(args['<identifier>'], metadata, append=append)
        payload = response.json()
        if not payload['success']:
            sys.stderr.write('error: {0} ({1})\n'.format(payload['error'], response.status_code))
            sys.exit(1)
        sys.stdout.write('success: {0}\n'.format(payload['log']))
    elif args['--formats']:
        # Unique file formats, one per line.
        formats = {f.format for f in item.iter_files()}
        sys.stdout.write('\n'.join(formats) + '\n')
    else:
        # Default: dump the full metadata as JSON.
        sys.stdout.write(dumps(item.metadata) + '\n')
    sys.exit(0)
Example #36
0
 def test_upload(self):
     """Debug-upload a remote image URL into the opencontext test item."""
     session = self.start_ia_session()
     target = get_item('opencontext-test-item', archive_session=session, debug=True)
     return target.upload(
         'https://artiraq.org/static/opencontext/abydos-looting/full/fig001.jpg'
     )
Example #37
0
 def get_ia_item(self, identifier):
     """Fetch an IA item via this object's session; return None on failure."""
     try:
         return get_item(identifier, archive_session=self.session)
     except Exception as e:
         # Best-effort: log the failure and let callers handle the None.
         self.logger.warn('Could not get item %s. Error %s', identifier, e)
         return None
    def validate(self):
        """Validate the form.

        Runs the base form validation first, then cross-checks the submitted
        track against its Archive.org item: title and artist must match the
        item metadata, a FLAC download must exist, and the track length must
        fall within the configured bounds.

        Returns:
            bool: True when no form-level errors were recorded.
        """
        initial_validation = super().validate()
        if not initial_validation:
            return False

        url, track_id = canonify_track_url(self.url.data)

        try:
            # 30s timeout so a slow archive.org does not hang the request.
            item = get_item(track_id, request_kwargs={'timeout': 30})
            metadata = item.item_metadata.get('metadata')
            if not metadata:
                raise ValueError("'%s' not found." % track_id)
        except Exception as exc:
            self.add_form_error(
                "Could not get meta data from Archive.org: %s" % exc)
        else:
            # Case-insensitive comparison against the user-entered fields.
            if metadata.get(
                    'title',
                    '').strip().lower() != self.title.data.strip().lower():
                self.add_form_error(
                    "Title does not match title in Archive.org meta data.")

            if metadata.get(
                    'creator',
                    '').strip().lower() != self.artist.data.strip().lower():
                self.add_form_error("Artist does not match creator / author "
                                    "in Archive.org meta data.")

            # Find the first file whose format is 'Flac'; the for/else only
            # records an error when the loop finishes without a break.
            flac = None
            for file in getattr(item, 'files', []):
                if file.get('format') == 'Flac':
                    flac = file
                    break
            else:
                self.add_form_error("Track not available in FLAC format.")

            if flac:
                # 'length' may be missing or non-numeric; fall back to 0.
                try:
                    length = float(flac.get('length', 0))
                except (TypeError, ValueError):
                    length = 0

                min_length = current_app.config.get('MIN_TRACK_LENGTH', 60.0)
                max_length = current_app.config.get('MAX_TRACK_LENGTH', 300.0)

                if length < min_length:
                    self.add_form_error(
                        "Track does not have minimum required duration (%s min.)."
                        % format_duration(min_length))
                elif length > max_length:
                    self.add_form_error(
                        "Track exceeds maximum allowed duration (%s min.)." %
                        format_duration(max_length))
            else:
                self.add_form_error(
                    "Missing meta data for FLAC download of track.")

        return not self.errors.get('form')
Example #39
0
def backup_report(ig, year, report_id, options=None):
    """Back up a single IG report (metadata + files) to the Internet Archive.

    Args:
        ig: inspector-general slug.
        year: report year.
        report_id: report identifier.
        options: optional dict of flags; recognized keys are "force"
            (re-upload even if already done) and "meta" (metadata only).

    Returns:
        bool: True when nothing needed doing or the upload succeeded,
        False when any upload step failed.
    """
    if options is None:
        options = {}

    logging.warn("")

    # this had better be there
    report = json.load(open(metadata_path(ig, year, report_id)))
    if report.get("unreleased"):
        logging.warn("[%s][%s][%s] Unreleased report, skipping." % (ig, year, report_id))
        return True

    # Local bookkeeping: skip reports already marked uploaded (unless forced).
    if already_uploaded(ig, year, report_id) and (options.get("force") is not True):
        logging.warn("[%s][%s][%s] Already backed up, skipping." % (ig, year, report_id))
        return True

    logging.warn("[%s][%s][%s] Initializing item." % (ig, year, report_id))
    item_id = item_id_for(ig, year, report_id)
    item = internetarchive.get_item(item_id)

    # Remote check: the item may exist on IA even if the local marker is
    # missing; record it locally and stop.
    if item.exists and (options.get("force") is not True):
        logging.warn("[%s][%s][%s] Ooooops, item does exist. Marking as done, and stopping." % (ig, year, report_id))
        mark_as_uploaded(ig, year, report_id)
        return True

    metadata = collection_metadata()
    metadata.update(item_metadata(report))

    # 1) add the metadata file, and attach the IA item metadata to it
    logging.warn("[%s][%s][%s] Sending metadata!" % (ig, year, report_id))
    success = upload_files(item, metadata_path(ig, year, report_id), metadata, options)

    if not success:
        logging.warn("[%s][%s][%s] :( Error sending metadata." % (ig, year, report_id))
        return False

    # 2) Unless --meta is on, upload the associated report files.
    if not options.get("meta"):
        report_path = file_path(ig, year, report_id, report["file_type"])
        text_path = file_path(ig, year, report_id, "txt")

        # Upload the report itself plus its text extraction (when distinct).
        to_upload = []
        if os.path.exists(report_path):
            to_upload.append(report_path)
        if (report_path != text_path) and os.path.exists(text_path):
            to_upload.append(text_path)

        if len(to_upload) > 0:
            logging.warn("[%s][%s][%s] Sending %i report files!" % (ig, year, report_id, len(to_upload)))
            success = upload_files(item, to_upload, None, options)

        # NOTE(review): when to_upload is empty, `success` still holds the
        # metadata-upload result from step 1, so this check is a no-op then.
        if not success:
            logging.warn("[%s][%s][%s] :( Error uploading report itself." % (ig, year, report_id))
            return False

    logging.warn("[%s][%s][%s] :) Uploaded:\n%s" % (ig, year, report_id, ia_url_for(item_id)))
    mark_as_uploaded(ig, year, report_id)

    return True
Example #40
0
def test_get_item_with_config_file(tmpdir, nasa_mocker):
    """get_item must honor the access key from an explicit config file."""
    tmpdir.chdir()
    with open('ia_test.ini', 'w') as fh:
        fh.write('[s3]\naccess = key2')

    fetched = get_item('nasa', config_file='ia_test.ini')
    assert fetched.session.access_key == 'key2'
Example #41
0
def test_get_item_with_config_file(tmpdir, nasa_mocker):
    """Access key from a local config file must win over the defaults."""
    tmpdir.chdir()
    config_body = '[s3]\naccess = key2'
    with open('ia_test.ini', 'w') as handle:
        handle.write(config_body)

    assert get_item('nasa', config_file='ia_test.ini').session.access_key == 'key2'
Example #42
0
def upload_to_ia():
    """Upload the WARC and CDX capture files to the Internet Archive item.

    Credentials come from the IAS3_ACCESS_KEY / IAS3_SECRET_KEY environment
    variables; the item identifier is WARC_FILE.
    """
    item = get_item(WARC_FILE)
    # Hoist the credential lookups out of the two upload calls.
    access_key = os.environ['IAS3_ACCESS_KEY']
    secret_key = os.environ['IAS3_SECRET_KEY']
    md = dict(mediatype='warc', creator='PhantomWARC')
    cdxmd = dict(mediatype='cdx', creator='PhantomWARC')
    item.upload(WARC_NAME, metadata=md, access_key=access_key, secret_key=secret_key)
    item.upload(CDX_NAME, metadata=cdxmd, access_key=access_key, secret_key=secret_key)
    ia_url = "https://archive.org/details/%s" % WARC_FILE
    # print() form works on Python 2 and 3 (the original used the Python 2
    # print statement, which is a SyntaxError under Python 3).
    print("WARC and CDX files uploaded to the Internet Archive as %s" % ia_url)
Example #43
0
def test_internet_archive():
    """Spot-check recently uploaded Perma links against their IA metadata.

    For every link marked as uploaded 2-3 days ago, verify the WARC file
    exists on archive.org and that each metadata field matches what Perma
    wrote.  Per-GUID results are collected and printed at the end.
    """
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(
        internet_archive_upload_status="completed", creation_timestamp__range=(start_date, end_date)
    )

    all_results = dict()

    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY, "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    internetarchive.get_session(config=c)

    for link in links:
        # BUG FIX: the results dict must be fresh for each link. Previously
        # one shared dict was created before the loop, so every entry of
        # all_results aliased the same object and only reflected the last
        # link's checks.
        guid_results = dict()
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz")]
            guid_results["uploaded_file"] = warc_name in fnames
            # The test collection stores 'collection' as a plain string;
            # production stores it as a list.
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid,
                truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url,
                link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization,
                    link.organization.registrar,
                )

        except Exception as e:
            # Record the failure for this GUID and keep checking the rest.
            guid_results["error"] = e

        all_results[link.guid] = guid_results

    # print() form is valid on both Python 2 and 3.
    print(all_results)
Example #44
0
def main(argv):
    """ia download entry point: fetch specific files or a whole item.

    <identifier> may embed file names as "identifier/file"; otherwise the
    file names come from the <file> arguments.  With no files at all, the
    whole item is downloaded subject to the source/format/glob filters.
    """
    args = docopt(__doc__, argv=argv)

    # Download specific files.
    if '/' in args['<identifier>']:
        identifier = args['<identifier>'].split('/')[0]
        # BUG FIX: derive the file list from the original argument. The old
        # code split `identifier`, which no longer contains a '/', so it
        # always produced [[]] instead of the requested file names.
        files = args['<identifier>'].split('/')[1:]
    else:
        identifier = args['<identifier>']
        files = args['<file>']

    item = get_item(identifier)
    # Verbose unless --quiet or --dry-run was given.
    if (args['--quiet'] is False) and (args['--dry-run'] is False):
        verbose = True
    else:
        verbose = False

    if files:
        if verbose:
            sys.stdout.write('{0}:\n'.format(identifier))
        for f in files:
            # NOTE(review): Python 2 era code — under Python 3, encode()
            # yields bytes and os.path.join(str, bytes) would fail.
            fname = f.encode('utf-8')
            if args['--no-directories']:
                path = fname
            else:
                path = os.path.join(identifier, fname)
            f = item.get_file(fname)
            if not f:
                sys.stderr.write(' {} doesn\'t exist!\n'.format(fname))
                continue
            if args['--dry-run']:
                # Dry run: print the URL instead of downloading.
                sys.stdout.write(f.url + '\n')
            else:
                f.download(path, verbose, args['--ignore-existing'], args['--checksum'],
                           args['--destdir'])
        sys.exit(0)

    # Otherwise, download the entire item.
    if args['--source']:
        ia_source = args['--source']
    elif args['--original']:
        ia_source = ['original']
    else:
        ia_source = None

    item.download(
        concurrent=args['--concurrent'],
        source=ia_source,
        formats=args['--format'],
        glob_pattern=args['--glob'],
        dry_run=args['--dry-run'],
        verbose=verbose,
        ignore_existing=args['--ignore-existing'],
        checksum=args['--checksum'],
        destdir=args['--destdir'],
        no_directory=args['--no-directories'],
    )
    sys.exit(0)
Example #45
0
def main(argv):
    """ia download entry point: fetch specific files or a whole item.

    <identifier> may embed file names as "identifier/file"; otherwise the
    file names come from the <file> arguments.  With no files at all, the
    whole item is downloaded subject to the source/format/glob filters.
    """
    args = docopt(__doc__, argv=argv)

    # Download specific files.
    if '/' in args['<identifier>']:
        identifier = args['<identifier>'].split('/')[0]
        # BUG FIX: derive the file list from the original argument. The old
        # code split `identifier`, which no longer contains a '/', so it
        # always produced [[]] instead of the requested file names.
        files = args['<identifier>'].split('/')[1:]
    else:
        identifier = args['<identifier>']
        files = args['<file>']

    item = get_item(identifier)
    # Verbose unless --quiet or --dry-run was given.
    if (args['--quiet'] is False) and (args['--dry-run'] is False):
        verbose = True
    else:
        verbose = False

    if files:
        if verbose:
            sys.stdout.write('{0}:\n'.format(identifier))
        for f in files:
            # NOTE(review): Python 2 era code — under Python 3, encode()
            # yields bytes and os.path.join(str, bytes) would fail.
            fname = f.encode('utf-8')
            if args['--no-directories']:
                path = fname
            else:
                path = os.path.join(identifier, fname)
            f = item.get_file(fname)
            if not f:
                sys.stderr.write(' {} doesn\'t exist!\n'.format(fname))
                continue
            if args['--dry-run']:
                # Dry run: print the URL instead of downloading.
                sys.stdout.write(f.url + '\n')
            else:
                f.download(path, verbose, args['--ignore-existing'],
                           args['--checksum'], args['--destdir'])
        sys.exit(0)

    # Otherwise, download the entire item.
    if args['--source']:
        ia_source = args['--source']
    elif args['--original']:
        ia_source = ['original']
    else:
        ia_source = None

    item.download(
        concurrent=args['--concurrent'],
        source=ia_source,
        formats=args['--format'],
        glob_pattern=args['--glob'],
        dry_run=args['--dry-run'],
        verbose=verbose,
        ignore_existing=args['--ignore-existing'],
        checksum=args['--checksum'],
        destdir=args['--destdir'],
        no_directory=args['--no-directories'],
    )
    sys.exit(0)
Example #46
0
def test_get_item_with_config_file(tmpdir):
    """get_item must honor the access key from an explicit config file."""
    tmpdir.chdir()
    with open("ia_test.ini", "w") as fh:
        fh.write("[s3]\naccess = key2")
    with responses.RequestsMock() as rsps:
        # Mock the metadata endpoint so no network traffic happens.
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        fetched = get_item("nasa", config_file="ia_test.ini")
        assert fetched.session.access_key == "key2"
Example #47
0
def _upload_files(args, identifier, local_file, upload_kwargs, prev_identifier=None):
    """Upload local_file(s) into the given IA item and report the results.

    Args:
        args: parsed docopt argument dict (reads --quiet, --log,
            --remote-name, --debug).
        identifier: archive.org item identifier to upload into.
        local_file: file path, list of paths, or file-like object.
        upload_kwargs: extra keyword arguments forwarded to Item.upload().
        prev_identifier: identifier from the previous call; used to avoid
            printing the same item header twice in a row.

    Exits the process with status 1 on authentication failure or any
    non-200 upload response.
    """
    verbose = True if args['--quiet'] is False else False
    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(identifier, config=config)
    if (verbose) and (prev_identifier != identifier):
        sys.stdout.write('{0}:\n'.format(item.identifier))

    try:
        if args['--remote-name']:
            # Upload under a different remote name than the local path.
            files = {args['--remote-name']: local_file}
        else:
            files = local_file
        response = item.upload(files, **upload_kwargs)
    except HTTPError as exc:
        response = [exc.response]
        if not response[0]:
            sys.exit(1)
        if response[0].status_code == 403:
            # 403 without keys -> not configured; with keys -> no permission.
            if (not item.session.access_key) and (not item.session.secret_key):
                sys.stderr.write('\nIAS3 Authentication failed. Please set your IAS3 '
                                 'access key and secret key \nvia the environment '
                                 'variables `IAS3_ACCESS_KEY` and `IAS3_SECRET_KEY`, '
                                 'or \nrun `ia configure` to add your IAS3 keys to your '
                                 'ia config file. You can \nobtain your IAS3 keys at the '
                                 'following URL:\n\n\t'
                                 'https://archive.org/account/s3.php\n\n')
            else:
                sys.stderr.write('\nIAS3 Authentication failed. It appears the keyset '
                                 '"{0}:{1}" \ndoes not have permission to upload '
                                 'to the given item or '
                                 'collection.\n\n'.format(item.session.access_key,
                                                          item.session.secret_key))
            sys.exit(1)

    if args['--debug']:
        # Debug mode: dump the prepared request(s) instead of uploading.
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            )
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))

    else:
        for resp in response:
            if not resp:
                continue
            # status_code may be falsy for queued/debug responses; 200 is OK.
            if (resp.status_code == 200) or (not resp.status_code):
                continue
            # Parse the IAS3 XML error body into a readable message.
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)
Example #48
0
def format_output(query_results):
    '''
    Generate a tuple of lists from search_results.

    Inputs:
        query_results: Search object containing results from a query
    Returns:
        output: tuple of (attribute_fields, item_fields), where
            attribute_fields is the list of column headers and item_fields
            holds one [identifier, title, creator, description] row per item
    '''
    attribute_fields = ['ID', 'Title', 'Creator', 'Description']

    # Fetch at most MAX_NUM_RESULTS items (enumerate + break replaces the
    # original manual countdown counter).
    items = []
    for count, result in enumerate(query_results):
        if count >= MAX_NUM_RESULTS:
            break
        items.append(ia.get_item(result['identifier']))

    item_fields = []
    for item in items:
        metadata = item.metadata
        identifier = metadata['identifier']

        # Not every item has these keys, so default missing ones to ''.
        title = metadata.get('title', '')
        creator = metadata.get('creator', '')
        description = metadata.get('description', '')

        # Replace the HTML line-break tag with a space (equivalent to the
        # original re.split + " ".join round-trip).
        description = description.replace('<br />', ' ')

        item_fields.append([identifier, title, creator, description])

    return (attribute_fields, item_fields)
Example #49
0
 def check_if_ia_item_exists(self, infodict):
     """Return 1 if the IA item for this video already exists, else 0.

     BUG FIX: the method was defined without `self`, so the reference to
     self.verbose below raised a NameError, and when called as a bound
     method `infodict` received the instance instead of the info dict.

     Args:
         infodict: youtube-dl style info dict (reads 'extractor',
             'display_id', 'title', 'webpage_url').
     """
     itemname = sanitize_identifier('%s-%s' % (infodict['extractor'],
                                               infodict['display_id']))
     item = internetarchive.get_item(itemname)
     if item.exists and self.verbose:
         print("\n:: Item already exists. Not downloading.")
         print('Title: %s' % infodict['title'])
         print('Video URL: %s\n' % infodict['webpage_url'])
         return 1
     return 0
Example #50
0
def main(argv):
    """ia metadata entry point (legacy API): check, modify, or print metadata.

    Flag behavior:
        --exists   exit 0/1 according to item existence
        --modify   apply key:value metadata changes
        --files    print one tab-separated metadata row per file
        --formats  print the set of distinct file formats
        --target   print selected metadata keys ('a/b' walks nested dicts)
        (default)  dump the full metadata as JSON
    """
    args = docopt(__doc__, argv=argv)
    item = get_item(args['<identifier>'])

    # Check existence of item.
    if args['--exists']:
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        else:
            stderr.write('{0} does not exist\n'.format(item.identifier))
            exit(1)

    # Modify metadata.
    elif args['--modify']:
        metadata = get_args_dict(args['--modify'])
        response = modify_metadata(args['<identifier>'], metadata)
        status_code = response['status_code']
        if not response['content']['success']:
            error_msg = response['content']['error']
            stderr.write('error: {0} ({1})\n'.format(error_msg, status_code))
            exit(1)
        stdout.write('success: {0}\n'.format(response['content']['log']))

    # Get metadata.
    elif args['--files']:
        for i, f in enumerate(item.files()):
            if not args['--target']:
                # Default per-file columns when no --target keys were given.
                files_md = [f.identifier, f.name, f.source, f.format, f.size, f.md5]
            else:
                files_md = [f.__dict__.get(k) for k in args['--target']]
            stdout.write('\t'.join([str(x) for x in files_md]) + '\n')
    elif args['--formats']:
        formats = set([f.format for f in item.files()])
        stdout.write('\n'.join(formats) + '\n')
    elif args['--target']:
        metadata = []
        for key in args['--target']:
            if '/' in key:
                # 'a/b' walks into nested metadata dicts one level at a time.
                for i, k in enumerate(key.split('/')):
                    if i == 0:
                        md = item.metadata.get(k)
                    else:
                        if md:
                            md = md.get(k)
            else:
                md = item.metadata.get(key)
            if md:
                metadata.append(md)
        stdout.write('\t'.join([str(x) for x in metadata]) + '\n')
    else:
        metadata = dumps(item.metadata)
        stdout.write(metadata + '\n')
    exit(0)
Example #51
0
    def delete_imagepdf(self, item, abby_filegz):
        """Delete the derived image-container PDF matching an abbyy file."""
        _, abbyy_name = os.path.split(abby_filegz)
        pdf_name = re.sub('_abbyy.gz$', '.pdf', abbyy_name)

        ia_item = internetarchive.get_item(item)
        pdf_obj = internetarchive.File(ia_item, pdf_name)
        # Only remove IA-generated derivatives, never source uploads.
        is_derived_pdf = (pdf_obj and pdf_obj.source == 'derivative'
                          and pdf_obj.format == 'Image Container PDF')
        if is_derived_pdf:
            pdf_obj.delete(access_key=self.access_key, headers=self.headers,
                           secret_key=self.secret_key)
            self.logger.warn('Old image pdf exists in %s. Deleted it', item)
Example #52
0
def save_items():
    """Snapshot metadata and file listings for every item under AUDIO_DIRS.

    Each immediate subdirectory name is treated as an IA identifier; the
    collected data is written to SBD_ITEMS as JSON via write_json().
    """
    # next(os.walk(...))[1] already yields a list; no copy needed.
    subdirs = next(os.walk(AUDIO_DIRS))[1]
    num_subdirs = len(subdirs)
    items = {}
    for index, identifier in enumerate(subdirs):
        # print() with one pre-built string works on Python 2 and 3 (the
        # original used the Python 2 print statement); the loop variable no
        # longer shadows the builtin `id`.
        print('%d/%d %s' % (index, num_subdirs, identifier))
        item = ia.get_item(identifier)
        items[identifier] = {}
        items[identifier]['metadata'] = item.metadata
        items[identifier]['files'] = item.files
    write_json(items, SBD_ITEMS)
Example #53
0
def main(argv):
    """ia upload entry point: upload local files or stdin to an IA item.

    Builds upload keyword arguments from the CLI flags, then uploads either
    the <file> arguments or stdin (spooled to a temp file; requires
    --remote-name).  Exits 1 on any non-200 upload response.

    NOTE(review): Python 2 era code — sys.stdin.read() returns str here but
    TemporaryFile() defaults to binary mode; under Python 3 this write
    would need bytes.
    """
    args = docopt(__doc__, argv=argv)
    verbose = True if args['--quiet'] is False else False

    if verbose is not False:
        sys.stdout.write('getting item: {0}\n'.format(args['<identifier>']))

    headers = get_args_dict(args['--header'])
    if args['--size-hint']:
        # Lets IA pre-allocate bucket space for large uploads.
        headers['x-archive-size-hint'] = args['--size-hint']

    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=headers,
        debug=args['--debug'],
        queue_derive=True if args['--no-derive'] is False else False,
        ignore_preexisting_bucket=args['--ignore-bucket'],
        verbose=verbose,
        delete=args['--delete'])

    # Upload stdin.
    if args['<file>'] == ['-'] and not args['-']:
        sys.stderr.write('--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        sys.exit(1)
    if args['-']:
        # Spool stdin into a temp file so the upload can seek/retry.
        local_file = TemporaryFile()
        local_file.write(sys.stdin.read())
        local_file.seek(0)
        upload_kwargs['key'] = args['--remote-name']
    # Upload files.
    else:
        local_file = args['<file>']

    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(args['<identifier>'], config=config)
    response = item.upload(local_file, **upload_kwargs)

    if args['--debug']:
        # Debug mode: print the prepared request(s) instead of uploading.
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join([' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()])
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if resp.status_code == 200:
                continue
            # Parse the IAS3 XML error body into a readable message.
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write('error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg))
            sys.exit(1)
Example #54
0
def get_item_metadata(identifier):
    '''
    :param identifier: corresponds to 'identifier' on <class 'internetarchive.item.Item'>
    :return: metadata (dict) from a single item in the collection; empty
        dict when the item could not be fetched
    '''
    try:
        item = internetarchive.get_item(identifier)
    except Exception:
        # BUG FIX: previously `item` was initialised to '' and the failure
        # fell through to `''.metadata`, raising AttributeError instead of
        # returning anything usable.
        return {}
    return item.metadata
Example #55
0
def test_get_item_with_config_file(tmpdir):
    """Access key from a local config file must win over the defaults."""
    tmpdir.chdir()
    config_body = '[s3]\naccess = key2'
    with open('ia_test.ini', 'w') as handle:
        handle.write(config_body)
    with responses.RequestsMock() as rsps:
        # Mock the metadata endpoint so no network traffic happens.
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        fetched = get_item('nasa', config_file='ia_test.ini')
        assert fetched.session.access_key == 'key2'
Example #56
0
 def getattr(self, path, fh=None):
     """FUSE getattr handler: map IA item fields onto stat-like entries."""
     st = get_item(self._full_path(path))
     created = st['created']
     # atime/ctime both map to 'created'; ownership is fixed to root.
     return {
         'st_atime': created,
         'st_ctime': created,
         'st_mtime': st['updated'],
         'st_size': st['item_size'],
         'st_gid': 0,
         'st_uid': 0,
     }
Example #57
0
    def __init__(self, item_id, dst_dir, metadata):
        """Initialise the uploader state for a single IA item.

        Args:
            item_id: archive.org identifier for the item.
            dst_dir: local destination directory.
            metadata: metadata dict to apply to the item.
        """
        self.item_id = item_id
        self.item = get_item(item_id)  # create IA item
        self.dst_dir = dst_dir
        self.metadata = metadata

        # three strikes and we're out
        self.timeout = 3

        # IA S3 API keys (set later by the caller; None until configured)
        self.access_key = None
        self.secret_key = None
Example #58
0
def _upload_files(args, identifier, local_file, upload_kwargs):
    """Upload local_file(s) into the given IA item and report the results.

    Args:
        args: parsed docopt argument dict (reads --quiet, --log, --debug).
        identifier: archive.org item identifier to upload into.
        local_file: file path, list of paths, or file-like object.
        upload_kwargs: extra keyword arguments forwarded to Item.upload().

    Exits the process with status 1 on authentication failure or any
    non-200 upload response.
    """
    verbose = True if args['--quiet'] is False else False
    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(identifier, config=config)
    if verbose:
        sys.stdout.write('{0}:\n'.format(item.identifier))

    try:
        response = item.upload(local_file, **upload_kwargs)
    except HTTPError as exc:
        response = [exc.response]
        if not response[0]:
            sys.exit(1)
        if response[0].status_code == 403:
            # 403 without keys -> not configured; with keys -> no permission.
            if (not item.session.access_key) and (not item.session.secret_key):
                sys.stderr.write('\nIAS3 Authentication failed. Please set your IAS3 '
                                 'access key and secret key \nvia the environment '
                                 'variables `IAS3_ACCESS_KEY` and `IAS3_SECRET_KEY`, '
                                 'or \nrun `ia configure` to add your IAS3 keys to your '
                                 'ia config file. You can \nobtain your IAS3 keys at the '
                                 'following URL:\n\n\t'
                                 'https://archive.org/account/s3.php\n\n')
            else:
                sys.stderr.write('\nIAS3 Authentication failed. It appears the keyset '
                                 '"{0}:{1}" \ndoes not have permission to upload '
                                 'to the given item or '
                                 'collection.\n\n'.format(item.session.access_key,
                                                          item.session.secret_key))
            sys.exit(1)

    if args['--debug']:
        # Debug mode: dump the prepared request(s) instead of uploading.
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            )
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))

    else:
        for resp in response:
            if not resp:
                continue
            if resp.status_code == 200:
                continue
            # Parse the IAS3 XML error body into a readable message.
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)
 def __init__(self, item_id, dst_dir, metadata):
     """Initialise the uploader state for a single IA item.

     Args:
         item_id: archive.org identifier for the item.
         dst_dir: local destination directory.
         metadata: metadata dict to apply to the item.
     """
     self.item_id = item_id
     self.item = get_item(item_id)        # create IA item
     self.dst_dir = dst_dir
     self.metadata = metadata

     # three strikes and we're out
     self.timeout = 3

     # IA S3 API keys (set later by the caller; None until configured)
     self.access_key = None
     self.secret_key = None