Example #1
def main(argv):

    if len(argv) != 3:
        print 'usage: python %s <PDF-File> <page-num>' % argv[0]
        exit(1)

    filename = argv[1]
    page_num = int(argv[2])

    page = PDFPage.extract_texts(filename, [page_num])[0]
    for ix, w in enumerate(page.words):
        out = '%02d: %s' % (ix + 1, w.t)
        print out.encode('utf8')

    print '-----------------------------------------------------------'

    preprocessor = RawTextPreprocessor(filename, page)
    page = preprocessor.run()
    for ix, w in enumerate(page.words):
        out = '%02d: %s' % (ix + 1, w.t)
        print out.encode('utf8')

    print '-----------------------------------------------------------'

    raw_texts = PDFPage.extract_raw_texts(filename, page_num)
    for ix, w in enumerate(raw_texts):
        out = '%02d: %s' % (ix + 1, w)
        print out.encode('utf8')
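
Each main() fragment on this page omits its imports and entry point. A minimal, hedged completion for Example #1, with the Thor module paths taken from the import blocks visible in Examples #10 and #11 (the sys entry-point guard itself is an assumption):

import sys

# module paths as shown in the test snippets below
from Thor.pdf.page import PDFPage
from Thor.preprocess.raw import RawTextPreprocessor

if __name__ == '__main__':
    main(sys.argv)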
Example #2
File: raw.py (project: yu-liang-kono/Thor)
    def run(self):
        """Main function.

        Merging words by taking advantage of raw stream content.
        Reducing number of words of a PDFPage.

        Returns:
            A PDFPage instance.

        """

        ret = PDFPage(page_num=self.page.page_num,
                      width=self.page.width,
                      height=self.page.height,
                      words=[])

        can_merge_streams = set()
        keep_merging = True
        while keep_merging:
            keep_merging = False

            # longer streams get merged first
            sorted_streams = sorted(
                zip(xrange(len(self.raw_streams)), self.raw_streams),
                key=lambda (s_ix, s): -1 * len(s._stream)
            )
            for stream_ix, stream in sorted_streams:
                if stream_ix not in can_merge_streams:
                    if not stream.may_merge():
                        # drop outlier matches, then retest the merge below
                        stream.discard_outliers()

                    if stream.may_merge():
                        can_merge_streams.add(stream_ix)
                        ret.words.append(self._merge_words_of_stream(stream_ix))
                        keep_merging = True

        flags = [True] * len(self.words)
        for stream_ix in can_merge_streams:
            stream = self.raw_streams[stream_ix]
            for match in stream.matches:
                flags[match.index] = False

        for word_ix, word in enumerate(self.words):
            if flags[word_ix]:
                ret.words.append(word._word_obj)

        ret.words = map(PDFText.create_from_dict, ret.words)

        return ret
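
The sort key -1 * len(s._stream) orders streams longest-first, so the largest candidate merges are attempted before smaller ones. A hedged toy illustration of that ordering, with FakeStream standing in for the real raw-stream object:

# Hedged toy: longest-first stream ordering, mirroring the sort above.
class FakeStream(object):
    def __init__(self, stream):
        self._stream = stream

streams = [FakeStream('ab'), FakeStream('abcdef'), FakeStream('abcd')]
sorted_streams = sorted(enumerate(streams),
                        key=lambda pair: -len(pair[1]._stream))
print([ix for ix, s in sorted_streams])  # -> [1, 2, 0]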
Example #3
def main(argv):

    if len(argv) != 3:
        print 'usage: python %s <PDF-File> <page-num>' % argv[0]
        exit(1)

    filename = argv[1]
    page_num = int(argv[2])

    page = PDFPage.extract_texts(filename, [page_num])[0]
    preprocessor = RawTextPreprocessor(filename, page)
    page = preprocessor.run()
    #with open('raw.txt', 'wb') as f:
    #    f.write(page.serialize())

    preprocessor = NaivePreprocessor(filename, page)
    page = preprocessor.run()
    #with open('naive.txt', 'wb') as f:
    #    f.write(page.serialize())

    preprocessor = FontSpecPreprocessor(filename, page)
    page = preprocessor.run()

    worker = XYCut()
    result = worker.run(page)
    out = '\n-----------------------------------------------\n'.join(result)
    print '\n\n\n'
    print out.encode('utf8')
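
The three preprocessors form a fixed pipeline: each one consumes the PDFPage produced by the previous step. A hedged restatement of the chaining as a loop, purely a stylistic alternative using the class names from the snippet:

# Hedged sketch: the same preprocessing chain expressed as a loop.
# Each preprocessor takes (filename, page) and returns a new PDFPage.
for preprocessor_cls in (RawTextPreprocessor,
                         NaivePreprocessor,
                         FontSpecPreprocessor):
    page = preprocessor_cls(filename, page).run()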
Example #4
def main(argv):

    if len(argv) != 3:
        print 'usage: python %s <PDF-File> <page-num>' % argv[0]
        exit(1)

    filename = argv[1]
    page_num = int(argv[2])

    page = PDFPage.extract_texts(filename, [page_num])[0]
    preprocessor = RawTextPreprocessor(filename, page)
    page = preprocessor.run()

    preprocessor = NaivePreprocessor(filename, page)
    page = preprocessor.run()

    preprocessor = FontSpecPreprocessor(filename, page)
    page = preprocessor.run()

    cmd = ('pdftocairo', '-f', str(page_num), '-l', str(page_num),
           '-jpeg', '-singlefile', '-cropbox',
           '-scale-to-x', str(int(page.width)), '-scale-to-y', '-1',
           filename, 'output')
    subprocess.check_call(cmd)

    with closing(open('output.js', 'wb')) as f:
        f.write('window.pdfdata=')
        f.write(page.serialize())
        f.write(';')
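
In the pdftocairo call, -scale-to-x fixes the rendered width in pixels and -scale-to-y -1 derives the height from the page's aspect ratio, so the JPEG lines up with the serialized coordinates. A hedged wrapper around the same invocation (the helper name is made up):

# Hedged helper (assumed name): render one page as <prefix>.jpg.
def render_page_jpeg(filename, page_num, width, prefix='output'):
    cmd = ('pdftocairo', '-f', str(page_num), '-l', str(page_num),
           '-jpeg', '-singlefile', '-cropbox',
           '-scale-to-x', str(int(width)), '-scale-to-y', '-1',
           filename, prefix)
    # raises CalledProcessError if pdftocairo exits non-zero
    subprocess.check_call(cmd)

As an aside, closing(open(...)) is redundant here: file objects are already context managers, so with open('output.js', 'wb') as f would behave the same.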
Example #5
File: naive.py (project: yu-liang-kono/Thor)
    def run(self):

        ret = PDFPage(page_num=self.page.page_num,
                      width=self.page.width,
                      height=self.page.height,
                      words=None)

        scale_factor = 1.0 * self._normalize_width / self.page.width
        self._scale_words(scale_factor)

        next_round = []
        while True:
            ismerged = [False] * len(self.words)

            for i, word1 in enumerate(self.words):
                if ismerged[i]:
                    continue

                for j in xrange(i + 1, len(self.words)):
                    if ismerged[j]:
                        continue

                    merged = self.factory.merge(word1, self.words[j])
                    if merged is None:
                        continue

                    ismerged[j] = True
                    word1 = self.words[i] = merged

                next_round.append(word1)

            next_round, self.words = [], next_round

            if not any(ismerged):
                break

        self._scale_words(1.0 / scale_factor)
        ret.words = map(lambda w: PDFText.create_from_dict(w._word_obj),
                        self.words)

        return ret
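
The loop is a fixed-point iteration: sweep all words, greedily merging any pair the factory accepts, and repeat until a full sweep merges nothing. A hedged toy sketch of the same pattern, with overlapping 1-D intervals standing in for mergeable words:

# Hedged sketch: fixed-point pairwise merging on 1-D intervals.
def merge(a, b):
    # union if the intervals overlap, else None (like factory.merge)
    if a[0] <= b[1] and b[0] <= a[1]:
        return (min(a[0], b[0]), max(a[1], b[1]))
    return None

def merge_to_fixpoint(items):
    while True:
        merged_any = False
        result = []
        for item in items:
            for ix, kept in enumerate(result):
                combined = merge(kept, item)
                if combined is not None:
                    result[ix] = combined
                    merged_any = True
                    break
            else:
                result.append(item)
        items = result
        if not merged_any:
            return items

print(merge_to_fixpoint([(0, 1), (5, 7), (1, 3), (6, 9)]))
# -> [(0, 3), (5, 9)]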
Example #6
    def parse_xml(self, xml_stream):
        """Parse XML and get font spec of every word.

        Args:
            xml_stream: An XML string.

        """

        start = xml_stream.find('<pdf2xml')
        end = xml_stream.find('</pdf2xml>') + len('</pdf2xml>')
        jq = PyQuery(xml_stream[start:end])

        boxes = PDFPage.get_page_bboxes(self.pdf_filename, self.page.page_num)
        crop_box = boxes['crop']

        page_element = jq('page')[0]
        page_width = float(page_element.attrib['width'])
        page_height = float(page_element.attrib['height'])

        fontspec_elements = jq('fontspec')
        for fs in fontspec_elements:
            attr = fs.attrib
            fid, fsize, fcolor = attr['id'], attr['size'], attr['color']
            self._fontspecs[fid] = FontSpec(size=int(fsize), color=fcolor[1:])

        text_elements = jq('text')
        for ix, text in enumerate(text_elements):
            attr = text.attrib
            top, left = float(attr['top']), float(attr['left'])
            width, height = float(attr['width']), float(attr['height'])
            # pdftohtml bug: some elements report zero width, so fall
            # back to the element height
            width = height if width == 0 else width

            if (top >= page_height or top + height <= 0) or \
               (left + width <= 0 or left > page_width):
                continue

            self._words.append({
                'top': top - crop_box[1],
                'left': left - crop_box[0],
                'width': width,
                'height': height,
                'text': text.text,
                'font': self._fontspecs[attr['font']],
            })
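
parse_xml consumes pdftohtml's pdf2xml output. A hedged, self-contained illustration of the PyQuery access pattern it relies on (the XML fragment is made up):

# Hedged demo: the PyQuery calls used by parse_xml, on a made-up fragment.
from pyquery import PyQuery

fragment = (
    '<pdf2xml><page number="1" width="612" height="792">'
    '<fontspec id="0" size="12" color="#221714"/>'
    '<text top="10" left="20" width="50" height="12" font="0">hi</text>'
    '</page></pdf2xml>'
)
jq = PyQuery(fragment)
for fs in jq('fontspec'):
    # fcolor[1:] in parse_xml strips this leading '#'
    print('%s %s %s' % (fs.attrib['id'], fs.attrib['size'], fs.attrib['color']))
for t in jq('text'):
    print('%s %s %s' % (t.attrib['top'], t.attrib['left'], t.text))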
Example #7
def run(input_filename, page_nums, page_dir, output_filename):

    pages = PDFPage.extract_texts(input_filename, page_nums)

    if page_dir is not None:
        for p in pages:
            output_file = os.path.join(page_dir, '%03d.json' % p.page_num)
            with closing(open(output_file, 'wb')) as f:
                f.write(p.serialize())

    output = {
        'version': int(time.time()),
        'file': input_filename,
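        # page_nums None means every page was extracted, so len(pages) is
        # already the document's total page count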
        'page': len(pages) if page_nums is None else count_pages(input_filename),
        'data': map(lambda p: p.__json__(), pages),
    }

    with closing(open(output_filename, 'wb')) as f:
        f.write(ujson.dumps(output, ensure_ascii=False))
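
A hedged usage sketch, with every path a placeholder: extract pages 1-3, dump each page to pages/NNN.json, and write the combined document to out.json.

run('input.pdf', [1, 2, 3], 'pages', 'out.json')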
Example #8

curr_dir = os.path.dirname(os.path.abspath(__file__))

with given.a_pdf:

    sample_pdf = os.path.join(curr_dir, 'fixture', 'test1.pdf')

    with when.get_page_bounding_boxes_of_a_page:
        media_box = (0, 0, 683.15, 853.23)
        crop_box = (36.85, 36.85, 646.30, 816.38)
        bleed_box = (36.85, 36.85, 646.30, 816.38)
        trim_box = (36.85, 36.85, 646.30, 816.38)
        art_box = (36.85, 36.85, 646.30, 816.38)

        bboxes = PDFPage.get_page_bboxes(sample_pdf, 1)

        def bboxes_almost_the_same(bbox1, bbox2):
            for i in xrange(4):
                this(abs(bbox1[i] - bbox2[i])).should.be_less_than(1.0e-3)

        with then.media_box_should_be_correct:
            bboxes_almost_the_same(media_box, bboxes['media'])

        with then.crop_box_should_be_correct:
            bboxes_almost_the_same(crop_box, bboxes['crop'])

        with then.bleed_box_should_be_correct:
            bboxes_almost_the_same(bleed_box, bboxes['bleed'])

        with then.trim_box_should_be_correct:
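
The box tuples appear to be (x0, y0, x1, y1) in PDF points, which matches how parse_xml uses crop_box[0] and crop_box[1] as left/top offsets. A quick hedged check of the expected crop box dimensions:

# Hedged check: reading the box tuples as (x0, y0, x1, y1) in points.
crop_box = (36.85, 36.85, 646.30, 816.38)
print('%.2f x %.2f' % (crop_box[2] - crop_box[0],   # 609.45
                       crop_box[3] - crop_box[1]))  # 779.53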
Example #9
# local library imports
from Thor.pdf.page import PDFPage
from Thor.pdf.text import PDFText
from Thor.preprocess.fontspec import FontSpecPreprocessor
from Thor.utils.FontSpec import FontSpec

with given.a_FontSpecPreprocessor:

    curr_dir = os.path.abspath(os.path.dirname(__file__))
    sample_pdf = os.path.join(curr_dir, 'fixture', 'chew_people_11.pdf')
    sample_json = os.path.join(curr_dir, 'fixture', 'chew_people_11.json')

    with closing(open(sample_json)) as f:
        pdf_json = f.read().decode('utf8')

    preprocessor = FontSpecPreprocessor(sample_pdf, PDFPage.loads(pdf_json))

    with then.it_can_extract_all_font_specs_used_by_a_pdf_page:
        ground_truths = [
            FontSpec(size=6, color="221714"),
            FontSpec(size=5, color="221714"),
            FontSpec(size=38, color="221714"),
            FontSpec(size=27, color="221714"),
            FontSpec(size=8, color="221714"),
            FontSpec(size=4, color="000000"),
        ]
        ground_truths.sort(key=lambda fs: fs.size)

        font_specs = preprocessor.font_specs
        font_specs.sort(key=lambda fs: fs.size)
        the(len(font_specs)).should.equal(len(ground_truths))
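
The length check alone does not pin the specs down; a hedged sketch of an element-wise follow-up in the same assertion DSL (whether the original test continues this way is an assumption):

# Hedged sketch: compare specs pairwise after both lists are sorted by size.
for fs, expected in zip(font_specs, ground_truths):
    the(fs).should.equal(expected)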
Example #10
import ujson

# local library imports
from Thor.pdf.page import PDFPage
from Thor.pdf.text import PDFText
from Thor.preprocess.naive import NaivePreprocessor

with given.a_NaivePreprocessor:

    with when.it_normalizes_text_blocks_to_width_1000px:
        words = map(lambda i: dict(x=1 * i, y=2 * i, w=3 * i, h=4 * i, t=''),
                    xrange(10))
        preprocessor = NaivePreprocessor(
            'test.pdf',
            PDFPage(page_num=1,
                    width=200,
                    height=200,
                    words=map(PDFText.create_from_dict, words)))
        preprocessor._scale_words(1000 / 200.)

        with then.each_word_is_scaled_correctly:
            for ix, word in enumerate(preprocessor.words):
                the(word['x']).should.equal(5 * 1 * ix)
                the(word['y']).should.equal(5 * 2 * ix)
                the(word['w']).should.equal(5 * 3 * ix)
                the(word['h']).should.equal(5 * 4 * ix)

        del preprocessor, words

    with when.it_classifies_each_word_into_three_types_of_orientation:

        words = [{
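
The scale factor in the first test is 1000 / 200 = 5, which is why every expected coordinate is the original value times five. A hedged standalone check of that arithmetic:

# Hedged check of the scaling arithmetic asserted in the test above.
scale = 1000 / 200.0  # 5.0
for ix in range(10):
    word = dict(x=1 * ix, y=2 * ix, w=3 * ix, h=4 * ix)
    scaled = dict((k, v * scale) for k, v in word.items())
    assert scaled['x'] == 5 * 1 * ix and scaled['y'] == 5 * 2 * ix
    assert scaled['w'] == 5 * 3 * ix and scaled['h'] == 5 * 4 * ix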
Example #11
import ujson

# local library imports
from Thor.pdf.page import PDFPage
from Thor.preprocess.raw import RawTextPreprocessor

with given.a_RawTextPreprocessor:

    curr_dir = os.path.abspath(os.path.dirname(__file__))
    sample_json = os.path.join(curr_dir, 'fixture', 'test1.json')
    sample_raw = os.path.join(curr_dir, 'fixture', 'test1.rtxt')
    sample_pdf = os.path.join(curr_dir, 'fixture', 'test1.pdf')

    with closing(open(sample_json)) as f:
        preprocessor = RawTextPreprocessor(
            sample_pdf, PDFPage.loads(f.read().decode('utf8')))

    with then.it_extracts_texts_in_content_stream_order:

        raw_texts = preprocessor.page.extract_raw_texts(sample_pdf, 1)
        with closing(open(sample_raw)) as f:
            expected = f.read().decode('utf8').splitlines()

        # XXX the last raw stream is a form feed, so it is ignored here
        for i in xrange(22):
            the(raw_texts[i]).should.equal(expected[i])

    with then.each_word_obj_should_locate_itself_in_every_possible_raw_stream:

        ground_truth = (
            # 0