Example #1
0
def main():
    """ Run the process on the given asset where both the process and the
        asset id are given on the command line.
    """
    try:
        name     = sys.argv[1]
        asset_id = sys.argv[2]
        is_new   = int(sys.argv[3]) != 0

        work_item = {
            'Process-Name' : name,
            'Asset-ID'     : asset_id,
            'Is-New'       : is_new,
            }

        module    = driver.init_module(name)
        processor = driver.init_processor(module)


        try:
            work_item.update(operations.instantiate_asset(asset_id))
        except models.Asset.DoesNotExist:
            logging.error('Asset no longer exists: %s' % asset_id)
        except S3ResponseError, error:
            if error.status == 404:
                logging.error('Could not find asset in S3: %s' % asset_id)
            else:
                logging.exception('Unexpected error!')
                raise
        else:
Example #2
0
def message_as_json_dict(message, include_body = True):
    """
    Serialize message metadata and (optionally) the message body into a
    JSON-ready dictionary.

    The returned dictionary always carries the keys ``guid``, ``subject``,
    ``sender_address``, ``date`` and ``modified_date``.  When
    ``include_body`` is true and the message has an HTML or a plain-text
    MESSAGE_PART asset, the keys ``body`` and ``body_type`` are added too;
    HTML is preferred over plain text.

    ``modified_date`` is a string holding a whole number of microseconds:
    ``time.mktime()`` of the modification time scaled by 1,000,000 plus
    its microsecond component.

    NOTE(review): this value used to be documented as "milliseconds since
    epoch in UTC", but the arithmetic yields microseconds and
    ``time.mktime()`` interprets the time tuple as local time.  The value
    itself is left untouched so existing consumers keep working; confirm
    which unit and timezone they actually expect.
    """
    result = {
        'guid'                  : message.pk,
        'subject'               : message.subject,
        'sender_address'        : message.sender_address.email,
        'date'                  : message.date.ctime(),
        # Microseconds, see the docstring note about unit and timezone.
        'modified_date'         : '%.0f' % (
            time.mktime(message.modified_date.timetuple()) * 1000000
            + message.modified_date.microsecond),
        }

    if not include_body:
        return result

    # Prefer the HTML rendition of the body and fall back to plain text.
    body_type = models.MimeType.HTML
    message_asset = message.get_asset(models.AssetClass.MESSAGE_PART,
                                      body_type)
    if not message_asset:
        body_type = models.MimeType.TEXT
        message_asset = message.get_asset(models.AssetClass.MESSAGE_PART,
                                          body_type)
    if not message_asset:
        # No body asset of either type: metadata only.
        return result

    message_file = instantiate_asset(message_asset)['Local-Path']

    # Close the file deterministically instead of relying on garbage
    # collection.  try/finally (rather than ``with``) keeps this valid on
    # older Python 2 interpreters.
    message_handle = open(message_file, 'r')
    try:
        message_text = message_handle.read()
    finally:
        message_handle.close()

    #TODO: figure out why a message has some non-ascii chars
    # Undecodable bytes are dropped rather than raising.
    message_text = message_text.decode('utf8', 'ignore')

    #TODO: extract the message from a thread.
    # This will be much more complex than looking for
    # -----Original Message-----
    if body_type == models.MimeType.TEXT:
        # Keep only what precedes the first quoted-original marker.
        message_text = message_text.split('-----Original Message-----')[0]

    result['body'] = message_text
    result['body_type'] = body_type

    return result
Example #3
0
def _draw_page_list(page_list,
                    output_buffer=None,
                    username=None,
                    title=None,
                    view_type=None):
    """
    Draw a list of pages into a pdf file.

    Each page contributes one PDF page holding its image plus the text
    fragments extracted from its hOCR text asset.

    page_list     -- iterable of page objects exposing ``get_asset()``.
    output_buffer -- file-like object handed to ``Canvas``; a new
                     ``StringIO`` is created when omitted.
    username      -- if given, stored as the PDF author.
    title         -- if given, stored as the PDF title.
    view_type     -- name of the asset class used for the page image;
                     defaults to ``AssetClass.PAGE_IMAGE``.

    Returns the output buffer the PDF was written to.

    The local image and text files produced by ``instantiate_asset`` are
    removed after each page, including when drawing the page fails.
    """

    if output_buffer is None:
        output_buffer = StringIO()

    canvas = Canvas(output_buffer)

    if username is not None:
        canvas.setAuthor(username)

    if title is not None:
        canvas.setTitle(title)

    if view_type is None:
        view_type = AssetClass.PAGE_IMAGE

    view_asset_class = AssetClass.objects.get(name=view_type)
    text_asset_class = AssetClass.objects.get(name=AssetClass.PAGE_TEXT)

    for page in page_list:
        # NB Image.open seems to only work with a file (not a stream),
        # so the image and the text are materialised as local files that
        # must be deleted once the page has been drawn.
        local_files = []
        try:
            image_asset = page.get_asset(view_asset_class)
            image_file = instantiate_asset(image_asset)['Local-Path']
            local_files.append(image_file)
            image = Image.open(image_file)

            text_asset = page.get_asset(text_asset_class)
            text_file = instantiate_asset(text_asset)['Local-Path']
            local_files.append(text_file)

            # Close the handle deterministically; try/finally (rather
            # than ``with``) keeps this valid on older Python 2.
            text_handle = open(text_file, 'r')
            try:
                text = text_handle.read()
            finally:
                text_handle.close()

            image_width, image_heigth, text_fragments = \
                misc.extract_text_from_hocr(text)

            # Page size in canvas units.
            # NOTE(review): under Python 2, if image_width and DPI are
            # both ints this division truncates -- confirm their types.
            w = (image_width / DPI) * inch
            h = (image_heigth / DPI) * inch

            # Scale factors from image coordinates to canvas units.
            rw = w / image_width
            rh = h / image_heigth

            for fragment in text_fragments:
                # The y coordinate is flipped (h - ...); presumably hOCR
                # measures from the top edge -- TODO confirm.
                canvas.drawString(rw * fragment['x'],
                                  h - rh * fragment['y'],
                                  fragment['text'])
            canvas.drawInlineImage(image, 0, 0, w, h)
            canvas.setPageSize((w, h))
            canvas.showPage()
        finally:
            # Remove whatever was created for this page, even on failure.
            for local_file in local_files:
                os.remove(local_file)

    canvas.save()

    return output_buffer
Example #4
0
def _draw_page_list(page_list,
                    output_buffer = None,
                    username = None,
                    title = None,
                    view_type = None):
    """
    Draw a list of pages into a pdf file.

    For every page, its image asset and the text fragments extracted from
    its hOCR text asset are drawn onto a PDF page of their own.

    page_list     -- iterable of page objects exposing ``get_asset()``.
    output_buffer -- file-like object passed to ``Canvas``; when omitted
                     a fresh ``StringIO`` is used.
    username      -- optional PDF author.
    title         -- optional PDF title.
    view_type     -- asset class name for the page image; falls back to
                     ``AssetClass.PAGE_IMAGE``.

    Returns the buffer that received the PDF.

    The local files created by ``instantiate_asset`` for a page are
    deleted once that page is finished, whether or not drawing succeeded.
    """

    if output_buffer is None:
        output_buffer = StringIO()

    canvas = Canvas(output_buffer)

    if username is not None:
        canvas.setAuthor(username)

    if title is not None:
        canvas.setTitle(title)

    if view_type is None:
        view_type = AssetClass.PAGE_IMAGE

    view_asset_class = AssetClass.objects.get(name = view_type)
    text_asset_class = AssetClass.objects.get(name = AssetClass.PAGE_TEXT)

    for page in page_list:
        # NB Image.open seems to only work with a file (not a stream),
        # so we have to create (and delete) local files that hold the
        # image and the text.
        created_files = []
        try:
            image_asset = page.get_asset(view_asset_class)
            image_file = instantiate_asset(image_asset)['Local-Path']
            created_files.append(image_file)
            image = Image.open(image_file)

            text_asset = page.get_asset(text_asset_class)
            text_file = instantiate_asset(text_asset)['Local-Path']
            created_files.append(text_file)

            # Explicit close instead of leaking the handle; try/finally
            # is used so the code stays valid on older Python 2.
            hocr_handle = open(text_file,'r')
            try:
                text = hocr_handle.read()
            finally:
                hocr_handle.close()

            image_width, image_heigth, text_fragments = \
                misc.extract_text_from_hocr(text)

            # Page dimensions in canvas units.
            # NOTE(review): with int operands this division truncates
            # under Python 2 -- confirm the types of image_width and DPI.
            w = (image_width/DPI)*inch
            h = (image_heigth/DPI)*inch

            # Conversion factors from image coordinates to canvas units.
            rw = w/image_width
            rh = h/image_heigth

            for fragment in text_fragments:
                # y is mirrored (h - ...); presumably the hOCR origin is
                # the top-left corner -- TODO confirm.
                canvas.drawString(rw * fragment['x'],
                                  h - rh * fragment['y'],
                                  fragment['text'])
            canvas.drawInlineImage(image, 0, 0, w, h)
            canvas.setPageSize((w,h))
            canvas.showPage()
        finally:
            # Clean up this page's local files even when drawing failed.
            for created_file in created_files:
                os.remove(created_file)

    canvas.save()

    return output_buffer