def get_page_text(db, philo_id, page_num, filename, path, bytes):
    """Return the formatted text of one page, optionally with hits highlighted.

    The page's byte range is looked up in the ``pages`` table, that slice of
    ``<path>/data/TEXT/<filename>`` is read, and the result is passed through
    ``format``.

    Args:
        db: object whose ``dbh`` attribute is a DB-API connection.
        philo_id: document id; matched as a prefix (``"<philo_id> %"``).
        page_num: value compared against the ``n`` column of ``pages``.
        filename: name of the text file under ``<path>/data/TEXT/``.
        path: root directory that contains ``data/TEXT``.
        bytes: ``+``-separated absolute byte offsets of the hits to
            highlight; an empty value (or None) disables highlighting.

    Returns:
        The formatted page decoded from UTF-8 (undecodable bytes are
        dropped), or ``''`` when no matching page row exists.
    """
    cursor = db.dbh.cursor()
    cursor.execute("select start_byte, end_byte from pages where philo_id like ? and n=? limit 1",
                   (str(philo_id) + ' %', page_num))
    row = cursor.fetchone()
    if row is None:  # there are no pages in the doc
        return ''
    start_byte, end_byte = row
    start_byte = int(start_byte)
    end_byte = int(end_byte)

    # The context manager closes the handle even if seek/read raises.
    with open(path + '/data/TEXT/' + filename) as text_file:
        text_file.seek(start_byte)
        text = text_file.read(end_byte - start_byte)

    # Offsets are converted to int *before* sorting: sorting the raw strings
    # would order "100" before "99" and pick the wrong "first" hit.
    offsets = sorted(int(offset) for offset in bytes.split('+')) if bytes else []
    if offsets and start_byte < offsets[0] < end_byte:
        # Rebase the absolute offsets so they index into the page slice.
        page_offsets = [offset - start_byte for offset in offsets]
        text_start, text_middle, text_end = chunkifier(text, page_offsets, highlight=True)
        highlighted_text = text_start + text_middle + text_end
        # Span tags are rewritten with square brackets before format() and
        # turned back into angle brackets afterwards (presumably so format()
        # does not treat the highlight spans as document markup).
        # NOTE(review): the final replace converts *every* '[' and ']' in the
        # page, not only the ones introduced here.
        highlighted_text = re.sub('<(/?span[^>]*)>', '[\\1]', highlighted_text)
        highlighted_text = format(highlighted_text).decode("utf-8", "ignore")
        return highlighted_text.replace('[', '<').replace(']', '>')
    return format(text).decode("utf-8", "ignore")
Exemple #2
0
def get_page_text(db, obj, page_num, path, bytes):
    """Return the formatted text of the page that ``obj`` belongs to.

    Args:
        db: unused here; kept so the signature stays unchanged.
        obj: object providing ``get_page()`` (a mapping with ``start_byte``
            and ``end_byte``) and ``filename``.
        page_num: unused here; kept so the signature stays unchanged.
        path: root directory that contains ``data/TEXT``.
        bytes: ``+``-separated absolute byte offsets of the hits to
            highlight; an empty value (or None) disables highlighting.

    Returns:
        The result of ``format`` decoded from UTF-8, with undecodable bytes
        dropped. The hit offsets are passed to ``format`` only when the
        first hit lies strictly inside the page's byte range.
    """
    page = obj.get_page()
    start_byte = int(page['start_byte'])
    end_byte = int(page['end_byte'])

    # The context manager closes the handle even if seek/read raises.
    with open(path + '/data/TEXT/' + obj.filename) as text_file:
        text_file.seek(start_byte)
        text = text_file.read(end_byte - start_byte)

    # Convert to int before sorting; sorting the raw strings would put
    # "100" ahead of "99" and test the wrong offset against the page range.
    offsets = sorted(int(offset) for offset in bytes.split('+')) if bytes else []
    if offsets and start_byte < offsets[0] < end_byte:
        # Rebase the absolute offsets so they index into the page slice.
        page_offsets = [offset - start_byte for offset in offsets]
        return format(text, page_offsets).decode('utf-8', 'ignore')
    return format(text).decode("utf-8", "ignore")
Exemple #3
0
def get_page_text(db, obj, page_num, path, bytes):
    """Read and format the page of text associated with ``obj``.

    Args:
        db: not used by this function; retained for interface compatibility.
        obj: object exposing ``get_page()`` (returning a mapping with
            ``start_byte`` and ``end_byte``) and a ``filename`` attribute.
        page_num: not used by this function; retained for interface
            compatibility.
        path: root directory that contains ``data/TEXT``.
        bytes: hit positions as a ``+``-separated string of absolute byte
            offsets; a falsy value means "no highlighting".

    Returns:
        ``format(...)`` output decoded as UTF-8 (invalid bytes ignored).
        Offsets relative to the page start are handed to ``format`` only if
        the smallest hit offset falls strictly between the page's start and
        end bytes.
    """
    page = obj.get_page()
    page_start = int(page['start_byte'])
    page_end = int(page['end_byte'])
    file_path = path + '/data/TEXT/' + obj.filename

    # Use a context manager so the file is closed on every exit path.
    with open(file_path) as handle:
        handle.seek(page_start)
        text = handle.read(page_end - page_start)

    hits = []
    if bytes:
        # Sort numerically: a string sort would rank "100" before "99".
        hits = sorted(int(hit) for hit in bytes.split('+'))

    if hits and page_start < hits[0] < page_end:
        relative_hits = [hit - page_start for hit in hits]
        return format(text, relative_hits).decode('utf-8', 'ignore')
    return format(text).decode("utf-8", "ignore")
def get_text_obj(obj, path, query_args=False):
    """Return the formatted text of ``obj``, adding a page-image link if possible.

    Args:
        obj: text object providing ``filename``, ``philo_id``, ``db``,
            ``byte_start``, ``byte_end`` and ``get_page()``.
        path: root directory that contains ``data/TEXT``.
        query_args: ``+``-separated absolute byte offsets of hits, or a
            falsy value when there is nothing to highlight.

    Returns:
        The output of ``format`` decoded from UTF-8 (undecodable bytes are
        dropped). When the object's page has both ``n`` and ``img`` set and
        the text contains a ``<b class="headword">`` element, a link to the
        page image is inserted right after the headword text.
    """
    filename = obj.filename
    if filename and exists(path + "/data/TEXT/" + filename):
        path += "/data/TEXT/" + filename
    else:
        ## workaround for when no filename is returned with the full philo_id of the object
        philo_id = obj.philo_id[0] + ' 0 0 0 0 0 0'
        c = obj.db.dbh.cursor()
        c.execute("select filename from toms where philo_type='doc' and philo_id =? limit 1", (philo_id,))
        # Leading slash added so this branch builds the same
        # "<path>/data/TEXT/<filename>" location as the branch above.
        path += "/data/TEXT/" + c.fetchone()["filename"]

    byte_start = obj.byte_start
    # The context manager closes the handle even if seek/read raises.
    with open(path) as text_file:
        text_file.seek(byte_start)
        raw_text = text_file.read(obj.byte_end - byte_start)

    if query_args:
        # Rebase the absolute hit offsets onto the start of this object.
        offsets = sorted(int(byte) - byte_start for byte in query_args.split('+'))
    else:
        offsets = []
    formatted = format(raw_text, offsets).decode("utf-8", "ignore")

    page_obj = obj.get_page()
    if page_obj and page_obj['n'] and page_obj['img']:
        find_head = re.search("<b class=\"headword\">([^\n\r]*?)<br/></b>", formatted)
        if find_head:
            href = 'http://artflx.uchicago.edu/images/encyclopedie/' + page_obj["img"]
            page = "[page " + page_obj["n"] + "]"
            # Insert the link immediately after the captured headword text.
            cut = find_head.end(1)
            link = " <a href='%s' class='page_image_link'>%s</a>" % (href, page)
            formatted = formatted[:cut] + link + formatted[cut:]
    return formatted
Exemple #5
0
def get_text_obj(obj, path, query_args=False):
    """Return the formatted text of ``obj``, optionally with hit offsets.

    Args:
        obj: text object providing ``doc.filename``, ``philo_id``, ``db``,
            ``byte_start`` and ``byte_end``.
        path: root directory that contains ``data/TEXT``.
        query_args: ``+``-separated absolute byte offsets of hits, or a
            falsy value when there is nothing to highlight.

    Returns:
        The output of ``format`` for the object's byte range, decoded from
        UTF-8 with undecodable bytes dropped.
    """
    filename = obj.doc.filename
    if filename and exists(path + "/data/TEXT/" + filename):
        path += "/data/TEXT/" + filename
    else:
        ## workaround for when no filename is returned with the full philo_id of the object
        philo_id = obj.philo_id[0] + ' 0 0 0 0 0 0'
        c = obj.db.dbh.cursor()
        c.execute(
            "select filename from toms where philo_type='doc' and philo_id =? limit 1",
            (philo_id, ))
        path += "/data/TEXT/" + c.fetchone()["filename"]

    byte_start = obj.byte_start
    # The context manager closes the handle even if seek/read raises.
    with open(path) as text_file:
        text_file.seek(byte_start)
        raw_text = text_file.read(obj.byte_end - byte_start)

    if query_args:
        # Rebase the absolute hit offsets onto the start of this object.
        offsets = sorted(
            int(byte) - byte_start for byte in query_args.split('+'))
    else:
        offsets = []

    return format(raw_text, offsets).decode("utf-8", "ignore")
def get_page_text(db, obj, page_num, path, bytes):
    """Return the formatted page text for ``obj``, tracing ids to stderr.

    Args:
        db: unused here; kept so the signature stays unchanged.
        obj: object providing ``get_page()`` (a mapping with ``philo_id``,
            ``start_byte`` and ``end_byte``), ``filename``, ``philo_id``,
            ``type`` and ``page``.
        page_num: unused here; kept so the signature stays unchanged.
        path: root directory that contains ``data/TEXT``.
        bytes: ``+``-separated absolute byte offsets of the hits to
            highlight; an empty value (or None) disables highlighting.

    Returns:
        The result of ``format`` decoded from UTF-8, with undecodable bytes
        dropped. The hit offsets are passed to ``format`` only when the
        first hit lies strictly inside the page's byte range.
    """
    page = obj.get_page()
    # Debug trace written to stderr on every call.
    print >> sys.stderr, "OBJ_ID", obj.philo_id,
    print >> sys.stderr, "TYPE", obj.type
    print >> sys.stderr, 'PAGE', obj.page
    print >> sys.stderr, 'PHILO_ID', page['philo_id']
    start_byte = int(page['start_byte'])
    end_byte = int(page['end_byte'])

    # The context manager closes the handle even if seek/read raises.
    with open(path + '/data/TEXT/' + obj.filename) as text_file:
        text_file.seek(start_byte)
        text = text_file.read(end_byte - start_byte)

    # Convert to int before sorting; sorting the raw strings would put
    # "100" ahead of "99" and test the wrong offset against the page range.
    offsets = sorted(int(offset) for offset in bytes.split('+')) if bytes else []
    if offsets and start_byte < offsets[0] < end_byte:
        # Rebase the absolute offsets so they index into the page slice.
        page_offsets = [offset - start_byte for offset in offsets]
        return format(text, page_offsets).decode('utf-8', 'ignore')
    return format(text).decode("utf-8", "ignore")
def get_text_obj(obj, path, query_args=False):
    """Return the formatted text covering the byte range of ``obj``.

    Args:
        obj: text object providing ``filename``, ``philo_id``, ``db``,
            ``byte_start`` and ``byte_end``.
        path: root directory that contains ``data/TEXT``.
        query_args: ``+``-separated absolute byte offsets of hits, or a
            falsy value when there is nothing to highlight.

    Returns:
        The output of ``format`` decoded from UTF-8 (undecodable bytes are
        dropped).
    """
    filename = obj.filename
    if filename and exists(path + "/data/TEXT/" + filename):
        path += "/data/TEXT/" + filename
    else:
        ## workaround for when no filename is returned with the full philo_id of the object
        philo_id = obj.philo_id[0] + ' 0 0 0 0 0 0'
        c = obj.db.dbh.cursor()
        c.execute("select filename from toms where philo_type='doc' and philo_id =? limit 1", (philo_id,))
        # Leading slash added so this branch builds the same
        # "<path>/data/TEXT/<filename>" location as the branch above.
        path += "/data/TEXT/" + c.fetchone()["filename"]

    byte_start = obj.byte_start
    # The context manager closes the handle even if seek/read raises.
    with open(path) as text_file:
        text_file.seek(byte_start)
        raw_text = text_file.read(obj.byte_end - byte_start)

    if query_args:
        # Rebase the absolute hit offsets onto the start of this object.
        offsets = sorted(int(byte) - byte_start for byte in query_args.split('+'))
    else:
        offsets = []
    return format(raw_text, offsets).decode("utf-8", "ignore")