def test_search_items_fts(session):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//be-api.us.archive.org/ia-pub-fts-api'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')

        r = search_items('nina simone',
                         full_text_search=True,
                         archive_session=session)
        assert r.fts is True
        assert r.dsl_fts is False
        assert r.query == '!L nina simone'
        assert r.params == {'count': 10000, 'q': '!L nina simone'}

        r = search_items('nina simone',
                         full_text_search=True,
                         dsl_fts=True,
                         archive_session=session)
        assert r.fts is True
        assert r.dsl_fts is True
        assert r.query == 'nina simone'
        assert r.params == {'count': 10000, 'q': 'nina simone'}
        r = search_items('nina simone', dsl_fts=True, archive_session=session)
        assert r.fts is True
        assert r.dsl_fts is True
        assert r.query == 'nina simone'
        assert r.params == {'count': 10000, 'q': 'nina simone'}
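# For reference, a minimal sketch of the full-text search path exercised
# above, outside the test harness. This assumes an internetarchive version
# that supports the full_text_search keyword; the query is illustrative.
from internetarchive import search_items

# full_text_search=True wraps the query as '!L <query>' unless dsl_fts=True
# is also passed, per the assertions in the test above.
for doc in search_items('nina simone', full_text_search=True):
    print(doc)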
Example #2
def main(argv):
    args = docopt(__doc__, argv=argv)

    params = dict(p.split(':') for p in args['--parameters'])

    # format sort parameters.
    if args['--sort']:
        for i, field in enumerate(args['--sort']):
            key = 'sort[{0}]'.format(i)
            params[key] = field.strip().replace(':', ' ')

    query = ' '.join(args['<query>'])
    if args['--itemlist']:
        fields = ['identifier']
    else:
        fields = args['--field']
    search = search_items(query,
                          fields=fields,
                          params=params,
                          v2=args['--v2'])
    if args['--number-found']:
        sys.stdout.write('{0}\n'.format(search.num_found))
        sys.exit(0)
    for result in search:
        try:
            if args['--itemlist']:
                sys.stdout.write(result.get('identifier', ''))
            else:
                json.dump(result, sys.stdout)
            sys.stdout.write('\n')
        except IOError:
            sys.exit(0)
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/beta/scrape.php'
                   '?q=identifier%3Anasa&size=10000'.format(protocol))
    count_url = ('{0}//archive.org/services/search/beta/scrape.php'
                 '?q=identifier%3Anasa&total_only=true'.format(protocol))
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True,
                 status=200)
        rsps.add(responses.GET, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
Example #4
def internet_archive_download(destination_directory, collection='MBLWHOI', pdf_num=None):

    """ Uses the internetarchive Python package to stream pdf pages from a given collection
        into a provided destination_directory.
    """

    print('Beginning internet archive download...')

    for num, i in enumerate(internetarchive.search_items('collection:' + collection)):

        # Stop once the requested number of items has been processed.
        if pdf_num is not None and num == pdf_num:
            break

        archive_id = i['identifier']
        try:
            if not os.path.exists(os.path.join(destination_directory, archive_id)):
                x = internetarchive.download(archive_id, verbose=True, glob_pattern='*.pdf', destdir=destination_directory)
            elif os.listdir(os.path.join(destination_directory, archive_id)) == []:
                x = internetarchive.download(archive_id, verbose=True, glob_pattern='*.pdf', destdir=destination_directory)
        except KeyboardInterrupt:
            print('Cancelling download.')
            break
        except Exception:
            print('ERROR downloading', archive_id)
    return
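# Hypothetical invocation of the function above: stream PDFs from the
# default collection into ./pdfs, stopping after 10 items (the destination
# directory name is illustrative).
internet_archive_download('./pdfs', pdf_num=10)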
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/beta/scrape.php'
                   '?q=identifier%3Anasa&size=10000&REQUIRE_AUTH=true'.format(protocol))
    count_url = ('{0}//archive.org/services/search/beta/scrape.php'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&size=10000'.format(protocol))
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST, results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True,
                 status=200)
        rsps.add(responses.POST, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
Example #6
    def readdir(self, path, fh):
        dirents = ['.', '..']
        full_path = self._full_path(path)

        # search IA.. (ls ./nasa => ?query=nasa)
        for r in search_items(full_path).iter_as_items():
            yield r
def get_sponsored_books():
    """Performs the `ia` query to fetch sponsored books from archive.org"""
    # XXX Note: This `search_items` query requires the `ia` tool (the
    # one installed via virtualenv) to be configured with (scope:all)
    # privileged s3 keys.
    items = ia.search_items(
        'collection:openlibraryscanningteam',
        fields=[
            'identifier',
            'est_book_price',
            'est_scan_price',
            'scan_price',
            'book_price',
            'repub_state',
            'imagecount',
            'title',
            'donor',
            'openlibrary_edition',
            'publicdate',
            'collection',
            'isbn',
        ],
        params={
            'page': 1,
            'rows': 1000
        },
        config={'general': {
            'secure': False
        }},
    )
    return [
        item for item in items
        if not (item.get('repub_state') == '-1'
                and item.get('donor') in BLOCKED_PATRONS)
    ]
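# As the note in get_sponsored_books() says, this query needs privileged
# (scope:all) S3 keys. A hypothetical way to supply them in code rather
# than through the `ia` tool's config file; the key values are placeholders.
import internetarchive as ia

session = ia.get_session(config={'s3': {'access': 'YOUR_ACCESS_KEY',
                                        'secret': 'YOUR_SECRET_KEY'}})
items = session.search_items('collection:openlibraryscanningteam',
                             fields=['identifier', 'repub_state'],
                             params={'scope': 'all'})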
Example #8
def test_search_items(session):
    results_url = (
        '{0}//archive.org/services/search/v1/scrape'
        '?q=identifier%3Anasa&count=10000&REQUIRE_AUTH=true'.format(PROTOCOL))
    count_url = ('{0}//archive.org/services/search/v1/scrape'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&count=10000'.format(PROTOCOL))
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True)
        rsps.add(responses.POST,
                 count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', archive_session=session)
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
Example #10
    def show_all(self, collection=None):
        # Fall back to the default collection when none is given.
        collection = collection or self.DEFAULT_COLLECTION
        search = ia.search_items('collection:' + collection)
        data = []
        for i in search:
            data.append(i)
        return data
Example #11
def queueSearch(search_string):
    redis_conn = Redis('redis')
    q = Queue('normal', connection=redis_conn)
    search = search_items(search_string)

    for result in search:
        print(result['identifier'])
        job = q.enqueue(processMetadata_v3.main, result['identifier'])
def get_collection_items(collection_id):
    search_string = 'collection:' + collection_id
    search_results = ia.search_items(search_string)
    item_objects = []
    for item in search_results.iter_as_items():
        item_objects.append(item)
    print("Items found in collection: %d" %(len(item_objects)))
    return item_objects
Example #13
def append_acct_meta(uploader_email, add_subject):
    # get a list of all items made by the uploader
    print("Searching for all items...")
    search = search_items('uploader:%s' % uploader_email)

    # add subjects to each item in search results
    for result in search:
        append_meta(result['identifier'], add_subject)
Example #14
def test_search_items_as_items(session):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')
        r = search_items('identifier:nasa', archive_session=session)
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
Example #15
def lookup_case_insensitive_identifier(identifier):
    """Perform a case-insensitive lookup"""
    params = dict(page=1)
    search_results = internetarchive.search_items('identifier:' + identifier,
                                                  params=params, config=config)
    ids = [r['identifier'] for r in search_results]
    if ids:
        identifier = ids[0]
    return identifier
def test_search_items_as_items():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')
        r = search_items('identifier:nasa')
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
Example #17
def lists_ia_collection(collection):
    '''
    :param collection: Name of Collection to search taken from ARGS
    :return: list, result of IA.api search for collections of given name
    '''
    archive_search = internetarchive.search_items(collection)
    if archive_search.num_found < 1:
        raise Exception("No Collections found named {}".format(collection))
    else:
        return archive_search
Example #18
def _search_collection(collection_name):
    """ Searches the internet archive for the specified collection.
    if no items are found for the collection it returns None otherwise
    the Search object is returned.
    """
    collection = internetarchive.search_items('collection:{}'.format(collection_name))
    if collection.num_found == 0:
        return None
    else:
        return collection
Example #19
def random_mixtape():
    '''Return a random mixtape item'''

    mixtapes = []

    for i in internetarchive.search_items('collection:hiphopmixtapes'):
        mixtapes.append(i['identifier'])

    # select a random mixtape
    mixtape = mixtapes[randint(0, len(mixtapes) - 1)]
    return mixtape
Example #21
def test_search_items_with_fields():
    _j = json.loads(TEST_SCRAPE_RESPONSE)
    _j['items'] = [{'identifier': 'nasa', 'title': 'NASA Images'}]
    search_response_str = json.dumps(_j)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/services/search/beta/scrape.php'.format(
                     protocol),
                 body=search_response_str,
                 status=200)
        r = search_items('identifier:nasa', fields=['identifier', 'title'])
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
Example #22
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j["response"]["docs"] = [{"identifier": "nasa"}]
    _j["response"]["numFound"] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, "{0}//archive.org/advancedsearch.php".format(protocol), body=_search_r, status=200)
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        r = search_items("identifier:nasa", params={"page": "1", "rows": "1"})
        assert [x.identifier for x in r.iter_as_items()] == ["nasa"]
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
def test_search_items_as_items():
    search_response_str = json.dumps(SEARCH_RESPONSE)
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str,
                 status=200)
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        r = search_items('identifier:nasa')
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
Example #25
def test_search_items_as_items():
    search_response_str = json.dumps(TEST_SCRAPE_RESPONSE)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(
            responses.GET,
            "{0}//archive.org/services/search/beta/scrape.php".format(protocol),
            body=TEST_SCRAPE_RESPONSE,
            status=200,
        )
        rsps.add(responses.GET, "{0}//archive.org/metadata/nasa".format(protocol), body=ITEM_METADATA, status=200)
        r = search_items("identifier:nasa")
        assert [x.identifier for x in r.iter_as_items()] == ["nasa"]
        assert r.iter_as_items().search == r
Example #26
def test_search_items_with_fields():
    _j = json.loads(TEST_SCRAPE_RESPONSE)
    _j["items"] = [{"identifier": "nasa", "title": "NASA Images"}]
    search_response_str = json.dumps(_j)
    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.GET,
            "{0}//archive.org/services/search/beta/scrape.php".format(protocol),
            body=search_response_str,
            status=200,
        )
        r = search_items("identifier:nasa", fields=["identifier", "title"])
        assert list(r) == [{"identifier": "nasa", "title": "NASA Images"}]
Example #27
def get_book_items(query, rows=100, page=1, scope_all=False):
    """
    :param str query: an search query for selecting/faceting books
    :param int rows: limit how many results returned
    :param int page: starting page to offset search results
    :return: An `internetarchive` Item
    :rtype: `internetarchive` Item
    """
    params = {'page': page, 'rows': rows}
    if scope_all:
        params['scope'] = 'all'
    # this may need to get run as a session (priv'd access)
    return ia.search_items(query, params=params).iter_as_items()
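# Hypothetical caller of get_book_items(): iter_as_items() is lazy, so
# iteration can stop early without fetching metadata for the whole result
# set. The query string is illustrative.
from itertools import islice

for book in islice(get_book_items('collection:openlibraryscanningteam'), 5):
    print(book.identifier)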
    def get_collection_ids(self,
                           collection=DEFAULT_COLLECTION,
                           iter_as_items=False):
        # search
        idl = []
        search = search_items('collection:' + collection)
        total = search.num_found
        if iter_as_items:
            search = search.iter_as_items()
        print(f'>> [{self.name}] scanning', total,
              f'items in collection {collection}')
        # loop
        for i, result in enumerate(tqdm(search, total=total)):
            yield result['identifier'] if not iter_as_items else result
Example #29
def get_metadata(query: str, output: str):
    for i in search_items(query):
        identifier = i["identifier"]
        if output is not None:
            os.makedirs(output, exist_ok=True)
            contents = (urllib.request.urlopen(
                f'https://www.archive.org/metadata/{identifier}').read().
                        decode("utf-8"))
            out_file = os.path.join(output, f"{identifier}.json")

            print(f"Writing {out_file}...")
            with open(out_file, "w+") as o:
                o.write(contents)
def test_search_items_with_fields():
    search_r = deepcopy(SEARCH_RESPONSE)
    search_r['response']['docs'] = [
        {'identifier': 'nasa', 'title': 'NASA Images'}
    ]
    search_response_str = json.dumps(search_r)
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str,
                 status=200)
        r = search_items('identifier:nasa', fields=['identifier', 'title'])
        assert r.num_found == 1
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
Example #31
def sync_completed_sponsored_books(dryrun=False):
    """Retrieves a list of all completed sponsored books from Archive.org
    so they can be synced with Open Library, which entails:

    - adding IA ocaid into openlibrary edition
    - alerting patrons (if possible) by email of completion
    - possibly marking archive.org item status as complete/synced

    XXX Note: This `search_items` query requires the `ia` tool (the
    one installed via virtualenv) to be configured with (scope:all)
    privileged s3 keys.
    """
    items = ia.search_items(
        'collection:openlibraryscanningteam AND collection:inlibrary',
        fields=['identifier', 'openlibrary_edition'],
        params={
            'page': 1,
            'rows': 1000,
            'scope': 'all'
        },
        config={'general': {
            'secure': False
        }},
    )
    books = web.ctx.site.get_many([
        '/books/%s' % i.get('openlibrary_edition') for i in items
        if i.get('openlibrary_edition')
    ])
    unsynced = [book for book in books if not book.ocaid]
    ocaid_lookup = {
        '/books/%s' % i.get('openlibrary_edition'): i.get('identifier')
        for i in items
    }
    fixed = []
    for book in unsynced:
        book.ocaid = ocaid_lookup[book.key]
        with accounts.RunAs('ImportBot'):
            if not dryrun:
                web.ctx.site.save(book.dict(),
                                  "Adding ocaid for completed sponsorship")
            fixed.append({'key': book.key, 'ocaid': book.ocaid})
            # TODO: send out an email?... Requires Civi.
            if book.ocaid.startswith("isbn_"):
                isbn = book.ocaid.split("_")[-1]
                sponsorship = get_sponsorship_by_isbn(isbn)
                contact = sponsorship and sponsorship.get("contact")
                email = contact and contact.get("email")
                if not dryrun and email:
                    email_sponsor(email, book)
    return json.dumps(fixed)
Example #32
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int) Specific volume number of the reporter.  If blank
    function will cycle through all volumes of the reporter on IA.
    :return: None
    """

    reporter_key = ".".join(['law.free.cap', reporter])

    # Checks that the returned reporter is the requested one.
    # Ex. searching for Mich will return both Mich-app. and Mich.
    for ia_identifier in search_items(reporter_key):
        ia_key = ia_identifier['identifier']
        if ia_key.split(".")[3] != reporter:
            continue

        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter
        ia_volume = ia_key.split(".")[-1]
        if volume is not None:
            if volume != ia_volume:
                continue

        for item in get_files(ia_key):
            if "json.json" in item.name:
                continue

            if "json" in item.name:
                url = "https://archive.org/download/%s/%s" % (
                    ia_key, item.name)
                file_path = os.path.join(settings.MEDIA_ROOT,
                                         'harvard_corpus',
                                         '%s' % ia_key,
                                         '%s' % item.name,
                                         )
                directory = file_path.rsplit("/", 1)[0]
                if os.path.exists(file_path):
                    logger.info("Already captured: %s", url)
                    continue

                logger.info("Capturing: %s", url)
                mkdir_p(directory)
                data = requests.get(url, timeout=10).json()
                with open(file_path, 'w') as outfile:
                    json.dump(data, outfile, indent=2)
Example #33
def test_search_items_as_items():
    search_response_str = json.dumps(TEST_SCRAPE_RESPONSE)
    with responses.RequestsMock(
            assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/services/search/beta/scrape.php'.format(protocol),
                 body=TEST_SCRAPE_RESPONSE,
                 status=200)
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        r = search_items('identifier:nasa')
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
    def run_indexing(self):
        req = internetarchive.search_items('collection:archiveteam_newssites',
                                           config_file='account.ini',
                                           config={'general': {'secure': False}})
        if not req:
            return
        r = [s['identifier'] for s in req]
        for item in r:
            if item in self.indexed:
                continue
            if len(item.split('_')[-1]) >= 14 and item.count('_') == 2:
                if item not in self.items:
                    self.items[item] = Item(item)
                    self.items[item].run()
        del self.indexed
Example #35
def get_sponsored_books():
    """Performs the `ia` query to fetch sponsored books from archive.org"""
    from internetarchive import search_items
    params = {'page': 1, 'rows': 1000, 'scope': 'all'}
    fields = ['identifier','est_book_price','est_scan_price', 'scan_price',
              'book_price', 'repub_state', 'imagecount', 'title', 'donor',
              'openlibrary_edition', 'publicdate', 'collection', 'isbn']

    q = 'collection:openlibraryscanningteam'

    # XXX Note: This `search_items` query requires the `ia` tool (the
    # one installed via virtualenv) to be configured with (scope:all)
    # privileged s3 keys.
    config = {'general': {'secure': False}}
    return search_items(q, fields=fields, params=params, config=config)
def test_search_items():
    search_response_str = json.dumps(SEARCH_RESPONSE)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str,
                 status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert len(r) == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
Example #37
def fetch():
    error_log = open(ERRLOG, "a")
    errors = 0
    collection = sys.argv[1]
    search = internetarchive.search_items("collection:" + collection)
    for result in search:
        itemid = result["identifier"]
        item = internetarchive.get_item(itemid)
        try:
            item.download(destdir=COLLECTIONFOLDER)
        except Exception as e:
            error_log.write("Could not download " + itemid + " because of error: %s\n" % e)
            errors += 1
            print("There was an error; writing to log.")
        else:
            time.sleep(1)
Example #38
    def upload_collection(self):
        self.logger.info("Searching the Internet Archive...")
        results = search_items(query='collection:' + self.collection_name, fields=["identifier"])
        number_items = results.num_found
        self.logger.info(str(number_items) + " items found in collection '" + str(self.collection_name) + "'.")
        metadata = []
        self.logger.info("Starting download...")
        for i, s in enumerate(results):
            identifier = s.get("identifier")
            item = get_item(identifier)
            m = self.get_metadata(item)
            if not m:
                continue
            r = self.download_item(m)
            p = self.uploader.upload_claim(m)
        return True
def update_archive_books():
    params = {'mediatype': 'texts'}
    fields = (
        'creator', 'contributor', 'date', 'description', 'genre', 'language',
        'name', 'publisher',
        'source', 'scanningcenter', 'title', 'subject', 'volume',
    )
    fields = ()
    query = 'language:Telugu and mediatype:texts'
    query = 'mediatype:texts and languageSorter:Telugu'
    query = 'languageSorter:Telugu'
    data = []

    try:
        df = pd.read_csv('data/ia.csv', index_col=['identifier'])
    except FileNotFoundError:
        print('Creating new file')
        df = pd.DataFrame()

    cdf = df

    for index, item in enumerate(ia.search_items(query=query)):
        print(index, item)
        pk = item['identifier']
        if pk in df.index:
            continue
        item = ia.get_item(pk)
        metadata = item.item_metadata.get('metadata', {'item_url': ''})
        metadata['item_url'] = item.urls.details
        print(metadata['item_url'])
        data.append(metadata)

        # import ipdb; ipdb.set_trace()

        if index % 5 == 0:
            dfo = pd.read_csv('data/ia.csv', index_col=['identifier'])
            df = pd.DataFrame(data)
            df.set_index('identifier', inplace=True)

            df = pd.concat([dfo, df])
            df.to_csv('data/ia.csv')

            df = pd.read_csv('data/ia.csv', index_col=['identifier'])
            df.drop_duplicates(inplace=True)
            df.to_csv('data/ia.csv')
            print(df.shape, len(df))
            print('file saved')
Example #40
def main(argv, session=None):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type:
        Use(bool),
        '<query>':
        Use(lambda x: ' '.join(x)),
        '--parameters':
        Use(lambda x: get_args_dict(x, query_string=True)),
        '--sort':
        list,
        '--field':
        Use(lambda x: ['identifier'] if not x and args['--itemlist'] else x),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    # Support comma separated values.
    fields = list(chain.from_iterable([x.split(',') for x in args['--field']]))
    sorts = list(chain.from_iterable([x.split(',') for x in args['--sort']]))

    search = search_items(args['<query>'],
                          fields=fields,
                          sorts=sorts,
                          params=args['--parameters'])

    if args['--num-found']:
        print('{0}'.format(search.num_found))
        sys.exit(0)

    try:
        for result in search:
            if args['--itemlist']:
                print(result.get('identifier', ''))
            else:
                j = json.dumps(result)
                print(j)
    except ValueError as e:
        print('error: {0}'.format(e), file=sys.stderr)
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['docs'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(PROTOCOL),
                 body=_search_r)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.POST, 'https://archive.org/services/search/v1/scrape',
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', params={
            'page': '1', 'rows': '1'})
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
Example #42
def getDaySubjectsAll(query):
    """ Attempt to add subjects based on the most frequent n-grams in the items returned by the query """
    subjects = {}
    allsubjects = []

    identifiers = [item['identifier'] for item in search_items(query)]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for item, itemsubjects in zip(
                identifiers, executor.map(getDaySubjects, identifiers)):
            subjects[item] = itemsubjects
    for key, value in subjects.items():
        allsubjects += value
    frequentsubjects = [
        subject[0]
        for subject in collections.Counter(allsubjects).most_common(100)
    ]
    print("INFO: Would discard the following frequent subjects: {}".format(
        "; ".join(frequentsubjects)))
def main():
	# usage statement
	if(len(sys.argv) < 3):
		print("Append new subjects to all Internet Archive items uploaded by the current user.")
		print("Usage: %s '*****@*****.**' 'subject1;subject2;'" % (sys.argv[0]))
		sys.exit(1)
	else:
		# parameters
		uploader_email = sys.argv[1]
		add_subject = sys.argv[2]
		
	# get a list of all items made by the uploader
	print("Searching for all items...")
	search = search_items('uploader:%s' % uploader_email)
	
	# add subjects to each item in search results
	for result in search:
		append_meta(result['identifier'], add_subject)
Example #44
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['docs'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=_search_r,
                 status=200)
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        r = search_items('identifier:nasa', params={'page': '1', 'rows': '1'})
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
Example #45
def find_last_months_dumps_on_ia(yyyy_mm: str = yyyy_mm) -> bool:
    """
    Return True if both ol_dump_yyyy_mm and ol_cdump_yyyy_mm files
    have been saved on Internet Archive collection:ol_exports.

    >>> next_month = date.today().replace(day=1) + timedelta(days=31)
    >>> find_last_months_dumps_on_ia(f"{next_month:%Y-%m}")
    False
    """
    prefixes = {f"ol_dump_{yyyy_mm}": 0, f"ol_cdump_{yyyy_mm}": 0}
    # print(prefixes)
    for item in search_items("collection:ol_exports"):
        for prefix in prefixes:
            if item["identifier"].startswith(prefix):
                prefixes[prefix] += 1
                # Is there at least one item id starting with each prefix?
                if files_with_both_prefixes_found := all(prefixes.values()):
                    return files_with_both_prefixes_found
Example #46
def test_page_row_specification(session):
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['items'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(PROTOCOL),
                 body=_search_r)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', params={'page': '1', 'rows': '1'},
                         archive_session=session)
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
Example #47
def query_catalog(args_from_ui):
    '''
    Given the dictionary args_from_ui, query the catalog of radio-aporee-maps at
    the Internet Archive (archive.org).
    Inputs:
        args_from_ui: dict of keywords provided from the ui
    Returns:
        query_results: Search object containing results of the query
    '''
    
    query = 'collection:radio-aporee-maps '

    for arg in args_from_ui:
        keyword = args_from_ui[arg]
        query = query + arg + ':' + keyword + ' '

    query_results = ia.search_items(query)
    return query_results
Example #48
    def find_films(cls, search_term="collection:(Feature_Films) AND mediatype:(movies)"):
        """
        download films and process
        """
        films = search_items(search_term)

        films = [x['identifier'] for x in films]

        random_group = random.sample(films, 100)

        for t in random_group:
            print(t)
            try:
                fi = cls(t)
            except Exception:
                print("failure")
                fi = None
            if fi and not fi.failed:
                fi.create_gifs()
Example #49
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j["response"]["numFound"] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(
            responses.GET,
            "{0}//archive.org/services/search/beta/scrape.php".format(protocol),
            body=TEST_SCRAPE_RESPONSE,
            status=200,
        )
        rsps.add(responses.GET, "{0}//archive.org/advancedsearch.php".format(protocol), body=_search_r, status=200)
        r = search_items("identifier:nasa")
        expected_results = [{"identifier": "nasa"}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['docs'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=_search_r,
                 status=200)
        rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        rsps.add(responses.POST, 'https://archive.org/services/search/beta/scrape.php',
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa', params={
                         'page': '1', 'rows': '1'})
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
Example #53
def main(argv):
    args = docopt(__doc__, argv=argv)

    params = dict(p.split(':') for p in args['--parameters'])

    if args['--sort']:
        if not isinstance(args['--sort'], list):
            args['--sort'] = [args['--sort']]
        for i, field in enumerate(args['--sort']):
            key = 'sort[{0}]'.format(i)
            params[key] = field.strip().replace(':', ' ')

    fields = ['identifier'] + args['--field']

    query = ' '.join(args['<query>'])
    search_resp = search_items(query, fields=fields, params=params)
    if args['--number-found']:
        sys.stdout.write('{0}\n'.format(search_resp.num_found))
        sys.exit(0)
    for result in search_resp:
        output = '\t'.join([result.get(f, '') for f in fields])
        sys.stdout.write(output + '\n')
Example #54
def test_search_items_with_fields(session):
    _j = json.loads(TEST_SCRAPE_RESPONSE)
    _j['items'] = [
        {'identifier': 'nasa', 'title': 'NASA Images'}
    ]
    search_response_str = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/v1/scrape'
                   '?q=identifier%3Anasa&count=10000&REQUIRE_AUTH=true'
                   '&fields=identifier%2Ctitle'.format(PROTOCOL))
    count_url = ('{0}//archive.org/services/search/v1/scrape'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&count=10000'.format(PROTOCOL))
    with IaRequestsMock() as rsps:
        rsps.add(responses.POST, results_url,
                 match_querystring=True,
                 body=search_response_str)
        rsps.add(responses.POST, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', fields=['identifier', 'title'],
                         archive_session=session)
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
def main(argv, session=None):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<query>': Use(lambda x: ' '.join(x)),
        '--parameters': Use(lambda x: get_args_dict(x)),
        '--sort': list,
        '--field': Use(lambda x: ['identifier'] if not x and args['--itemlist'] else x),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Format sort parameters.
    for i, field in enumerate(args['--sort']):
        key = 'sort[{0}]'.format(i)
        args['--parameters'][key] = field.strip().replace(':', ' ')

    search = search_items(args['<query>'],
                          fields=args['--field'],
                          params=args['--parameters'])

    if args['--num-found']:
        print('{0}'.format(search.num_found))
        sys.exit(0)

    for result in search:
        if args['--itemlist']:
            print(result.get('identifier', ''))
        else:
            j = json.dumps(result)
            print(j)
Example #56
def archive_search(qry):
    return search_items(qry, config_file=str(basedir / 'ia.ini'))
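# Assumption: the ia.ini referenced above follows the standard
# internetarchive config layout (an [s3] section with access/secret keys).
# The equivalent inline config might look like this; the key values are
# placeholders.
from internetarchive import search_items

results = search_items('collection:example',
                       config={'s3': {'access': 'YOUR_ACCESS_KEY',
                                      'secret': 'YOUR_SECRET_KEY'}})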
Example #57
#!/usr/bin/env python

import os
import json
import ptree

from internetarchive import search_items, Item

total_bytes = 0

for result in search_items('collection:usda-nurseryandseedcatalog'):
    identifier = result['identifier']
    item = Item(identifier)
    metadata = item.get_metadata()
    item_dir = os.path.join('items', ptree.id2ptree(identifier).lstrip("/"))
    if not os.path.isdir(item_dir):
        os.makedirs(item_dir)
    with open(os.path.join(item_dir, 'metadata.json'), 'w') as fh:
        fh.write(json.dumps(metadata, indent=2))

    total_bytes += sum([f.size for f in item.iter_files()])
    print(item_dir)

print(total_bytes)
Example #58
import time
from internetarchive import search_items

search = search_items('mediatype:collection AND collection:etree', fields=['identifier','collection'])


for result in search:
    print(result['identifier'], result['collection'][0])

from rq import Queue
from redis import Redis
# from processMetadata import ImportShow
import time
import processMetadata_v3
from internetarchive import search_items



# sstring = 'mediatype:etree AND creator:"Blue Turtle Seduction"'
sstring = 'mediatype:collection AND collection:etree'
search = search_items(sstring)


redis_conn = Redis('redis')
q = Queue(connection=redis_conn)

for result in search:
    print(result['identifier'])
    job = q.enqueue(processMetadata_v3.main, result['identifier'])
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    destdir_msg = '--destdir must be a valid path to a directory.'

    # Validate args.
    s = Schema({
        str: Use(bool),
        '--destdir': Or([], And(Use(lambda d: d[0]), dir_exists), error=destdir_msg),
        '--format': list,
        '--glob': Use(lambda l: l[0] if l else None),
        '<file>': list,
        '--search': Or(str, None),
        '--itemlist': Or(str, None),
        '<identifier>': Or(str, None),
        '--retries': Use(lambda x: x[0]),
    })

    # Filenames should be unicode literals. Support PY2 and PY3.
    if six.PY2:
        args['<file>'] = [f.decode('utf-8') for f in args['<file>']]

    try:
        args = s.validate(args)
    except SchemaError as exc:
        sys.stderr.write('{0}\n{1}\n'.format(
            str(exc), printable_usage(__doc__)))
        sys.exit(1)

    retries = int(args['--retries'])

    if args['--itemlist']:
        ids = [x.strip() for x in open(args['--itemlist'])]
        total_ids = len(ids)
    elif args['--search']:
        _search = search_items(args['--search'])
        total_ids = _search.num_found
        ids = search_ids(args['--search'])

    # Download specific files.
    if args['<identifier>']:
        if '/' in args['<identifier>']:
            identifier = args['<identifier>'].split('/')[0]
            files = ['/'.join(args['<identifier>'].split('/')[1:])]
        else:
            identifier = args['<identifier>']
            files = args['<file>']
        total_ids = 1
        ids = [identifier]
    else:
        files = None

    errors = list()
    for i, identifier in enumerate(ids):
        if total_ids > 1:
            item_index = '{0}/{1}'.format((i + 1), total_ids)
        else:
            item_index = None

        try:
            item = session.get_item(identifier)
        except Exception as exc:
            print('{0}: failed to retrieve item metadata - {1}'.format(identifier, exc))
            continue

        # Otherwise, download the entire item.
        _errors = item.download(
            files=files,
            formats=args['--format'],
            glob_pattern=args['--glob'],
            dry_run=args['--dry-run'],
            verbose=args['--verbose'],
            silent=args['--silent'],
            ignore_existing=args['--ignore-existing'],
            checksum=args['--checksum'],
            destdir=args['--destdir'],
            no_directory=args['--no-directories'],
            retries=retries,
            item_index=item_index,
            ignore_errors=True
        )
        if _errors:
            errors.append(_errors)
    if errors:
        # TODO: add option for a summary/report.
        sys.exit(1)
    else:
        sys.exit(0)