def main(): """ Run the process on the given asset where both the process and the asset id are given on the command line. """ try: name = sys.argv[1] asset_id = sys.argv[2] is_new = int(sys.argv[3]) != 0 work_item = { 'Process-Name' : name, 'Asset-ID' : asset_id, 'Is-New' : is_new, } module = driver.init_module(name) processor = driver.init_processor(module) try: work_item.update(operations.instantiate_asset(asset_id)) except models.Asset.DoesNotExist: logging.error('Asset no longer exists: %s' % asset_id) except S3ResponseError, error: if error.status == 404: logging.error('Could not find asset in S3: %s' % asset_id) else: logging.exception('Unexpected error!') raise else:
def message_as_json_dict(message, include_body=True):
    """
    Serialize message metadata and (optionally) the message body into a
    JSON-ready dictionary.

    Parameters:
      message      -- object exposing pk, subject, sender_address.email,
                      date, modified_date and get_asset().
      include_body -- when true, also look up the message body asset and
                      add 'body' and 'body_type' to the result.

    Returns a dict with the keys 'guid', 'subject', 'sender_address',
    'date' and 'modified_date', plus 'body' / 'body_type' when
    include_body is true and a body asset was found.

    NOTE: 'modified_date' is a string of digits computed as
    time.mktime(...) * 1000000 + microsecond, i.e. it is expressed in
    *micro*seconds since the epoch, and time.mktime interprets the
    timestamp as local time. The emitted value is kept as it always was
    because callers may rely on it.
    """
    result = {
        'guid': message.pk,
        'subject': message.subject,
        'sender_address': message.sender_address.email,
        'date': message.date.ctime(),
    }

    # Whole seconds from mktime scaled by 10**6, plus the sub-second part.
    modified = message.modified_date
    result['modified_date'] = '%.0f' % (
        time.mktime(modified.timetuple()) * 1000000 + modified.microsecond)

    if not include_body:
        return result

    # Prefer the HTML part and fall back to the plain text part.
    body_type = models.MimeType.HTML
    message_asset = message.get_asset(models.AssetClass.MESSAGE_PART,
                                      body_type)
    if not message_asset:
        body_type = models.MimeType.TEXT
        message_asset = message.get_asset(models.AssetClass.MESSAGE_PART,
                                          body_type)
    if not message_asset:
        # No body asset of either type: metadata only.
        return result

    message_file = instantiate_asset(message_asset)['Local-Path']
    # Close the file deterministically instead of leaking the handle.
    with open(message_file, 'r') as body_file:
        message_text = body_file.read()

    # TODO: figure out why a message has some non-ascii chars;
    # undecodable bytes are dropped here.
    message_text = message_text.decode('utf8', 'ignore')

    # TODO: extract the message from a thread.
    # This will be much more complex than looking for
    # -----Original Message-----
    if body_type == models.MimeType.TEXT:
        message_text = message_text.split('-----Original Message-----')[0]

    result['body'] = message_text
    result['body_type'] = body_type
    return result
def _draw_page_list(page_list, output_buffer=None, username=None,
                    title=None, view_type=None):
    """
    Draw a list of pages into a pdf file.

    For every page the hOCR text fragments are drawn first and the page
    image is then drawn over the whole page area (presumably so the text
    stays available beneath the image -- confirm with the PDF consumers).

    Parameters:
      page_list     -- iterable of page objects exposing get_asset().
      output_buffer -- file-like object handed to Canvas; a new StringIO
                       is created when None.
      username      -- optional author recorded in the PDF.
      title         -- optional title recorded in the PDF.
      view_type     -- name of the AssetClass used for the page image;
                       defaults to AssetClass.PAGE_IMAGE.

    Returns the buffer the PDF was saved into.
    """
    if output_buffer is None:
        output_buffer = StringIO()

    canvas = Canvas(output_buffer)
    if username is not None:
        canvas.setAuthor(username)
    if title is not None:
        canvas.setTitle(title)

    if view_type is None:
        view_type = AssetClass.PAGE_IMAGE
    view_asset_class = AssetClass.objects.get(name=view_type)
    text_asset_class = AssetClass.objects.get(name=AssetClass.PAGE_TEXT)

    for page in page_list:
        # NB Image.open seems to only work with a file (not a stream),
        # so the image and the hOCR text are materialised as local files
        # that must be deleted again once the page has been drawn.
        local_files = []
        try:
            image_asset = page.get_asset(view_asset_class)
            image_file = instantiate_asset(image_asset)['Local-Path']
            local_files.append(image_file)
            image = Image.open(image_file)

            text_asset = page.get_asset(text_asset_class)
            text_file = instantiate_asset(text_asset)['Local-Path']
            local_files.append(text_file)
            with open(text_file, 'r') as hocr_file:
                text = hocr_file.read()

            image_width, image_height, text_fragments = \
                misc.extract_text_from_hocr(text)

            # Page size in points. float() keeps integer pixel sizes from
            # being truncated by Python 2's integer `/`.
            w = (float(image_width) / DPI) * inch
            h = (float(image_height) / DPI) * inch
            # Scale factors from hOCR pixel coordinates to points.
            rw = w / image_width
            rh = h / image_height

            for fragment in text_fragments:
                # The y coordinate is flipped (h - ...) before drawing.
                canvas.drawString(rw * fragment['x'],
                                  h - rh * fragment['y'],
                                  fragment['text'])
            canvas.drawInlineImage(image, 0, 0, w, h)
            canvas.setPageSize((w, h))
            canvas.showPage()
        finally:
            # Remove whatever was fetched, even when drawing failed.
            for path in local_files:
                os.remove(path)

    canvas.save()
    return output_buffer
def _draw_page_list(page_list, output_buffer=None, username=None,
                    title=None, view_type=None):
    """
    Draw a list of pages into a pdf file.

    For every page the hOCR text fragments are drawn first and the page
    image is then drawn over the whole page area (presumably so the text
    stays available beneath the image -- confirm with the PDF consumers).

    Parameters:
      page_list     -- iterable of page objects exposing get_asset().
      output_buffer -- file-like object handed to Canvas; a new StringIO
                       is created when None.
      username      -- optional author recorded in the PDF.
      title         -- optional title recorded in the PDF.
      view_type     -- name of the AssetClass used for the page image;
                       defaults to AssetClass.PAGE_IMAGE.

    Returns the buffer the PDF was saved into.
    """
    if output_buffer is None:
        output_buffer = StringIO()

    canvas = Canvas(output_buffer)
    if username is not None:
        canvas.setAuthor(username)
    if title is not None:
        canvas.setTitle(title)

    if view_type is None:
        view_type = AssetClass.PAGE_IMAGE
    view_asset_class = AssetClass.objects.get(name=view_type)
    text_asset_class = AssetClass.objects.get(name=AssetClass.PAGE_TEXT)

    for page in page_list:
        # NB Image.open seems to only work with a file (not a stream),
        # so the image and the hOCR text are materialised as local files
        # that must be deleted again once the page has been drawn.
        local_files = []
        try:
            image_asset = page.get_asset(view_asset_class)
            image_file = instantiate_asset(image_asset)['Local-Path']
            local_files.append(image_file)
            image = Image.open(image_file)

            text_asset = page.get_asset(text_asset_class)
            text_file = instantiate_asset(text_asset)['Local-Path']
            local_files.append(text_file)
            with open(text_file, 'r') as hocr_file:
                text = hocr_file.read()

            image_width, image_height, text_fragments = \
                misc.extract_text_from_hocr(text)

            # Page size in points. float() keeps integer pixel sizes from
            # being truncated by Python 2's integer `/`.
            w = (float(image_width) / DPI) * inch
            h = (float(image_height) / DPI) * inch
            # Scale factors from hOCR pixel coordinates to points.
            rw = w / image_width
            rh = h / image_height

            for fragment in text_fragments:
                # The y coordinate is flipped (h - ...) before drawing.
                canvas.drawString(rw * fragment['x'],
                                  h - rh * fragment['y'],
                                  fragment['text'])
            canvas.drawInlineImage(image, 0, 0, w, h)
            canvas.setPageSize((w, h))
            canvas.showPage()
        finally:
            # Remove whatever was fetched, even when drawing failed.
            for path in local_files:
                os.remove(path)

    canvas.save()
    return output_buffer