def search_naa():
    """Run a name search using the current request's query parameters.

    Every query-string argument is forwarded as a keyword argument to
    ``RSSearchClient.search_names``. ``page`` defaults to 1 when the
    request does not supply it, and ``sort`` is always forced to 3.

    Returns:
        The search results passed through ``jsonify``.
    """
    rs = RSSearchClient()
    kwargs = request.args.to_dict()
    kwargs['page'] = request.args.get('page', 1)
    kwargs['sort'] = 3
    # Single-argument print() call behaves the same on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(kwargs)
    results = rs.search_names(**kwargs)
    return jsonify(results)
def search_naa():
    """Run a name search using the current request's query parameters.

    Every query-string argument is forwarded as a keyword argument to
    ``RSSearchClient.search_names``. ``page`` defaults to 1 when the
    request does not supply it, and ``sort`` is always forced to 3.

    Returns:
        The search results passed through ``jsonify``.
    """
    rs = RSSearchClient()
    kwargs = request.args.to_dict()
    kwargs['page'] = request.args.get('page', 1)
    kwargs['sort'] = 3
    # Single-argument print() call behaves the same on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(kwargs)
    results = rs.search_names(**kwargs)
    return jsonify(results)
Exemple #3
0
 def __init__(self, series, control=None):
     """Set up harvest state for ``series`` and connect to the item store.

     Args:
         series: Series identifier stored on the instance.
         control: Optional control value stored on the instance.
     """
     self.series, self.control = series, control
     self.total_pages, self.pages_complete = None, 0
     self.client = RSSearchClient()
     # Runs only after the attributes above have been assigned.
     self.prepare_harvest()
     self.items = self.get_db().items
Exemple #4
0
 def __init__(self, series, control=None):
     """Set up harvest state for ``series`` and connect to the item store.

     Args:
         series: Series identifier stored on the instance.
         control: Optional control value stored on the instance.
     """
     self.series, self.control = series, control
     self.total_pages, self.pages_complete = None, 0
     self.client = RSSearchClient()
     # Runs only after the attributes above have been assigned.
     self.prepare_harvest()
     self.items = self.get_db().items
Exemple #5
0
class SeriesHarvester():
    """Harvest search results and page images for a single series.

    Search results are fetched page by page through ``RSSearchClient`` and
    inserted into the ``items`` collection of the default MongoDB database.
    Images of digitised items are downloaded to ``IMAGES_DIR`` and their
    dimensions recorded in the ``images`` collection.
    """

    def __init__(self, series, control=None):
        """Store the search criteria and work out how many pages exist.

        Args:
            series: Series identifier passed to every search.
            control: Optional control value used to narrow the search.
        """
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        db = self.get_db()
        self.items = db.items

    def get_db(self):
        """Return the default database of a new ``MongoClient``."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def get_total(self):
        """Return the client's ``total_results`` for the latest search."""
        return self.client.total_results

    def get_db_total(self):
        """Return the number of stored items belonging to this series."""
        return self.items.find({'series': self.series}).count()

    def prepare_harvest(self):
        """Run an initial search and compute ``self.total_pages``."""
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: an exact multiple of the page size (or zero
        # results) must not produce a trailing empty page, and the result
        # must stay an integer on Python 3.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print(self.total_pages)

    def start_harvest(self, page=None):
        """Fetch result pages and insert their results into ``self.items``.

        Args:
            page: Page number to start from. When omitted, harvesting
                continues from the page after ``self.pages_complete``.
        """
        if page:
            # An explicit start page means everything before it is treated
            # as done, so the loop stops at the last real page instead of
            # running ``total_pages`` iterations past it.
            self.pages_complete = page - 1
        else:
            page = self.pages_complete + 1
        while self.pages_complete < self.total_pages:
            if self.control:
                response = self.client.search(series=self.series,
                                              page=page,
                                              control=self.control)
            else:
                response = self.client.search(series=self.series,
                                              page=page,
                                              sort='9')
            results = response['results']
            # insert_many() rejects an empty document list.
            if results:
                self.items.insert_many(results)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            time.sleep(1)

    def harvest_images(self):
        """Download page images for every digitised item in the series.

        For each page not already on disk the image is fetched, saved,
        recorded in the ``images`` collection, and thumbnails are written
        to a ``thumbs`` sub-directory: one square crop per entry in
        ``IMAGE_SIZES`` plus one thumbnail bounded by 200x200.

        Raises:
            requests.HTTPError: If an image request returns an error status.
        """
        db = self.get_db()
        items = db.items.find({
            'series': self.series,
            'digitised_status': True
        })
        images = db.images
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join(
                IMAGES_DIR,
                '{}/{}-[{}]'.format(self.series.replace('/', '-'),
                                    item['control_symbol'].replace('/', '-'),
                                    item['identifier']))
            thumbs_dir = os.path.join(directory, 'thumbs')
            # Test the thumbs directory itself: the item directory may exist
            # without it. makedirs() also creates the parent when needed.
            if not os.path.exists(thumbs_dir):
                os.makedirs(thumbs_dir)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory,
                                                  item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if os.path.exists(filename):
                    # Already downloaded; no request made, so no delay.
                    continue
                img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(
                    item['identifier'], page)
                response = requests.get(img_url,
                                        headers=headers,
                                        stream=True)
                response.raise_for_status()
                try:
                    image = Image.open(StringIO(response.content))
                except Exception:
                    # Best effort: skip payloads that cannot be opened as an
                    # image, but let KeyboardInterrupt/SystemExit through.
                    print('Not an image')
                else:
                    width, height = image.size
                    image.save(filename)
                    del response
                    image_meta = {
                        'item_id': item['_id'],
                        'identifier': item['identifier'],
                        'page': page,
                        'width': width,
                        'height': height
                    }
                    images.insert_one(image_meta)
                    print('Image saved')
                    for size in IMAGE_SIZES:
                        new_width, _ = size
                        thumb_file = '{}/thumbs/{}-p{}-{}-sq.jpg'.format(
                            directory, item['identifier'], page, new_width)
                        thumb_image = ImageOps.fit(image, size,
                                                   Image.ANTIALIAS)
                        thumb_image.save(thumb_file)
                        thumb_image.close()
                    thumb_file = '{}/thumbs/{}-p{}-200.jpg'.format(
                        directory, item['identifier'], page)
                    thumb_image = image.copy()
                    thumb_image.thumbnail((200, 200))
                    thumb_image.save(thumb_file)
                    thumb_image.close()
                    image.close()
                # Pause between image requests.
                time.sleep(5)
Exemple #6
0
class SeriesHarvester():
    """Harvest search results and page images for a single series.

    Search results are fetched page by page through ``RSSearchClient`` and
    inserted into the ``items`` collection of the default MongoDB database.
    Images of digitised items are downloaded to ``IMAGES_DIR`` and their
    dimensions recorded in the ``images`` collection.
    """

    def __init__(self, series, control=None):
        """Store the search criteria and work out how many pages exist.

        Args:
            series: Series identifier passed to every search.
            control: Optional control value used to narrow the search.
        """
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        db = self.get_db()
        self.items = db.items

    def get_db(self):
        """Return the default database of a new ``MongoClient``."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def get_total(self):
        """Return the client's ``total_results`` for the latest search."""
        return self.client.total_results

    def get_db_total(self):
        """Return the number of stored items belonging to this series."""
        return self.items.find({'series': self.series}).count()

    def prepare_harvest(self):
        """Run an initial search and compute ``self.total_pages``."""
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: an exact multiple of the page size (or zero
        # results) must not produce a trailing empty page, and the result
        # must stay an integer on Python 3.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print(self.total_pages)

    def start_harvest(self, page=None):
        """Fetch result pages and insert their results into ``self.items``.

        Args:
            page: Page number to start from. When omitted, harvesting
                continues from the page after ``self.pages_complete``.
        """
        if page:
            # An explicit start page means everything before it is treated
            # as done, so the loop stops at the last real page instead of
            # running ``total_pages`` iterations past it.
            self.pages_complete = page - 1
        else:
            page = self.pages_complete + 1
        while self.pages_complete < self.total_pages:
            if self.control:
                response = self.client.search(series=self.series, page=page, control=self.control)
            else:
                response = self.client.search(series=self.series, page=page, sort='9')
            results = response['results']
            # insert_many() rejects an empty document list.
            if results:
                self.items.insert_many(results)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            time.sleep(1)

    def harvest_images(self):
        """Download page images for every digitised item in the series.

        For each page not already on disk the image is fetched, saved,
        recorded in the ``images`` collection, and thumbnails are written
        to a ``thumbs`` sub-directory: one square crop per entry in
        ``IMAGE_SIZES`` plus one thumbnail bounded by 200x200.

        Raises:
            requests.HTTPError: If an image request returns an error status.
        """
        db = self.get_db()
        items = db.items.find({'series': self.series, 'digitised_status': True})
        images = db.images
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join(IMAGES_DIR, '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-'), item['identifier']))
            thumbs_dir = os.path.join(directory, 'thumbs')
            # Test the thumbs directory itself: the item directory may exist
            # without it. makedirs() also creates the parent when needed.
            if not os.path.exists(thumbs_dir):
                os.makedirs(thumbs_dir)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if os.path.exists(filename):
                    # Already downloaded; no request made, so no delay.
                    continue
                img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)
                response = requests.get(img_url, headers=headers, stream=True)
                response.raise_for_status()
                try:
                    image = Image.open(StringIO(response.content))
                except Exception:
                    # Best effort: skip payloads that cannot be opened as an
                    # image, but let KeyboardInterrupt/SystemExit through.
                    print('Not an image')
                else:
                    width, height = image.size
                    image.save(filename)
                    del response
                    image_meta = {
                        'item_id': item['_id'],
                        'identifier': item['identifier'],
                        'page': page,
                        'width': width,
                        'height': height
                        }
                    images.insert_one(image_meta)
                    print('Image saved')
                    for size in IMAGE_SIZES:
                        new_width, _ = size
                        thumb_file = '{}/thumbs/{}-p{}-{}-sq.jpg'.format(directory, item['identifier'], page, new_width)
                        thumb_image = ImageOps.fit(image, size, Image.ANTIALIAS)
                        thumb_image.save(thumb_file)
                        thumb_image.close()
                    thumb_file = '{}/thumbs/{}-p{}-200.jpg'.format(directory, item['identifier'], page)
                    thumb_image = image.copy()
                    thumb_image.thumbnail((200, 200))
                    thumb_image.save(thumb_file)
                    thumb_image.close()
                    image.close()
                # Pause between image requests.
                time.sleep(5)