def _purge_batch(self, batch):
    batch_name = batch.name
    # deleting the batch in one shot causes memory to bloat,
    # so we delete it piecemeal
    for issue in batch.issues.all():
        for page in issue.pages.all():
            page.delete()
            # remove the cached word coordinates file for the page
            coords_path = models.coordinates_path(page._url_parts())
            if os.path.exists(coords_path):
                os.remove(coords_path)
            reset_queries()
        issue.delete()
    batch.delete()
    if self.PROCESS_OCR:
        self.solr.delete_query('batch:"%s"' % batch_name)
        self.solr.commit()
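# Why reset_queries() appears above: with settings.DEBUG = True, Django
# appends every executed SQL statement to connection.queries, so a long
# delete loop grows memory without bound unless that log is cleared.
# A minimal sketch of the same pattern (delete_pages and its queryset
# argument are illustrative, not part of this module):
from django.db import reset_queries

def delete_pages(queryset):
    for page in queryset.iterator():  # stream rows instead of caching the queryset
        page.delete()
        reset_queries()  # clear Django's per-connection query log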
def _process_coordinates(self, page, coords):
    _logger.debug("writing out word coords for %s" % page.url)
    # the coordinates file holds gzipped JSON, so open it in binary mode
    with open(models.coordinates_path(page._url_parts()), "wb") as f:
        f.write(gzip_compress(json.dumps(coords)))
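# gzip_compress is a project helper that is not defined in this excerpt;
# a minimal stand-in, assuming it only needs to gzip a JSON string into
# the bytes written above:
import gzip

def gzip_compress(data):
    if isinstance(data, str):
        data = data.encode("utf-8")  # gzip operates on bytes, not str
    return gzip.compress(data)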
def coordinates(request, lccn, date, edition, sequence, words=None):
    url_parts = dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    try:
        with gzip.open(models.coordinates_path(url_parts), 'rb') as file_data:
            data = json.load(file_data)
    except IOError:
        # no coordinates file for this page; return an empty response
        return HttpResponse()
    non_lexemes = re.compile(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$|'s$")
    return_coords = data.copy()
    # reset coords to {} and rebuild it, stripping unwanted punctuation
    # from the keys
    return_coords['coords'] = {}
    for key in data.get('coords', {}):
        return_coords['coords'][non_lexemes.sub('', key)] = data['coords'][key]
    r = HttpResponse(content_type='application/json')
    r.write(json.dumps(return_coords))
    return r
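# A quick sanity check of the cleanup regex used above: it strips leading
# and trailing punctuation plus a trailing possessive, so quoted or
# possessive OCR tokens collapse to plain dictionary keys:
import re

non_lexemes = re.compile(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$|'s$")
assert non_lexemes.sub('', '"Hello,') == 'Hello'
assert non_lexemes.sub('', "dog's") == 'dog'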