def test_page_has_word(self):
        """Check Page.find_word across lines, line infos and the header flag.

        Covers, in order: a single-line match with no line info, preference
        for the first matching line, a match on the second line, the line
        info returned with and without ``has_header`` set, and the exception
        raised when the word is absent.
        """
        page = Page('5')
        expected_line = Line('bob has a train', 1, StubSpellChecker(()))
        page.append(expected_line)
        # assert simple match (no line infos attached yet, so None comes back)
        line, line_info = page.find_word('has')
        self.assertEqual(expected_line, line)
        self.assertIsNone(line_info)
        next_line = Line('he has no friends', 1, StubSpellChecker(()))
        page.append(next_line)
        # assert picks first line when several lines contain the word
        line, line_info = page.find_word('has')
        self.assertEqual(expected_line, line)

        # assert can find in second line
        line, line_info = page.find_word('friends')
        self.assertEqual(next_line, line)
        line_info_1 = LineInfo(1)
        line_info_2 = LineInfo(2)
        line_info_3 = LineInfo(3)
        page.line_infos = [line_info_1, line_info_2, line_info_3]

        # assert finds correct line info no header
        line, line_info = page.find_word('friends')
        self.assertEqual(line_info_2, line_info)

        page.has_header = True
        # assert finds correct line info with header: the expected info moves
        # one entry further down the list once the header flag is set
        line, line_info = page.find_word('friends')
        self.assertEqual(line_info_3, line_info)

        # assert raises if not found
        self.assertRaises(NoWordException, page.find_word, 'notinstrings')
Beispiel #2
0
    def set_page_nbr(self, page_nbr):
        """Point this object and its frame at page ``page_nbr``.

        Loads ``images/pages/<page_nbr>.pbm`` as an RGB image into the frame,
        rebuilds the frame's ``line_infos`` list from
        ``working/page_info/<page_nbr>.csv`` and finally shows the panel.

        Each CSV row contributes one ``LineInfo``: column 3 is passed to the
        constructor, columns 1, 2 and 4 become ``height``, ``left_margin`` and
        ``width``. Column 0 is not read here.
        """
        image_path = 'images/pages/{}.pbm'.format(page_nbr)
        info_path = 'working/page_info/{}.csv'.format(page_nbr)

        self.page_nbr = page_nbr
        self.frame.image = Image.open(image_path).convert('RGB')
        self.frame.page_nbr = page_nbr

        # Start from an empty list so stale entries of a previous page vanish.
        self.frame.line_infos = []
        with open(info_path, 'r') as info_file:
            for fields in csv.reader(info_file):
                entry = LineInfo(int(fields[3]))
                entry.height = int(fields[1])
                entry.left_margin = int(fields[2])
                entry.width = int(fields[4])
                self.frame.line_infos.append(entry)

        self.frame.show_panel()
Beispiel #3
0
    def load(self, raw_file_dir):
        """Create pages and lines from all the page files in a directory.

        Steps:

        1. Read every ``*.csv`` file in ``working/page_info`` (if the
           directory exists) and group the resulting ``LineInfo`` objects by
           the value in column 0 of each row.
        2. Read ``working/headers.txt`` (if it exists) and collect the text
           before the first ``|`` of every line.
        3. For every ``<number>.txt`` file in ``raw_file_dir`` whose number is
           within ``self.start_page`` .. ``self.end_page`` (``end_page <= 0``
           means no upper bound), build a ``Page`` and append one ``Line`` per
           text line, numbered from 1.
        4. Refresh ``average_length``, ``average_lines_per_page`` and
           ``page_numbers``.

        Files in ``raw_file_dir`` that are not ``.txt`` or whose base name is
        not an integer are ignored.

        :param raw_file_dir: directory containing the UTF-8 page text files.
        """
        line_infos = defaultdict(list)
        page_info_dir = 'working/page_info'
        if os.path.exists(page_info_dir):
            for filename in os.listdir(page_info_dir):
                if not filename.endswith('.csv'):
                    continue
                print('parsing {}'.format(filename))
                with open('{}/{}'.format(page_info_dir, filename), 'rb') as f:
                    for row in csv.reader(f):
                        if not row:
                            # csv yields [] for blank lines; nothing to parse.
                            continue
                        line_info = LineInfo(int(row[3]))
                        line_info.height = int(row[1])
                        line_info.left_margin = int(row[2])
                        line_info.width = int(row[4])
                        line_infos[row[0]].append(line_info)

        headers = set()
        if os.path.exists('working/headers.txt'):
            with open('working/headers.txt', 'rb') as f:
                for raw_line in f:
                    # Strip so an entry without a '|' does not keep its
                    # trailing newline, which could never equal a filename.
                    headers.add(raw_line.split('|')[0].strip())

        # Keep only "<integer>.txt" files; anything else in the directory
        # (editor backups, .DS_Store, ...) must not abort the whole load.
        page_files = []
        for fn in os.listdir(raw_file_dir):
            basename, ext = os.path.splitext(fn)
            if ext != '.txt':
                continue
            try:
                page_files.append((int(basename), basename, fn))
            except ValueError:
                continue
        page_files.sort(key=lambda entry: entry[0])

        for page_nbr, basename, fn in page_files:
            if page_nbr < self.start_page:
                continue
            if self.end_page > 0 and page_nbr > self.end_page:
                # Files are sorted by page number, so nothing later qualifies.
                break
            with codecs.open('{}/{}'.format(raw_file_dir, fn), mode='r', encoding='utf-8') as f:
                if self.verbose:
                    print('Loading page {:>3}'.format(basename))
                page = Page(basename, fn in headers, line_infos[basename])
                self.pages[basename] = page
                for idx, text in enumerate(f, 1):
                    page.append(Line(text.strip(), idx, self.spell_checker))

        self.average_length = self.calculate_average_length()
        page_count = len(self.pages)
        if page_count:
            total_lines = sum(len(page) for page in self.pages.values())
            self.average_lines_per_page = total_lines / page_count
        else:
            # No page loaded: avoid dividing by zero.
            self.average_lines_per_page = 0
        self.page_numbers = sorted(self.pages.keys(), key=int)
Beispiel #4
0
 def OnAddLine(self, event):
     """Add a LineInfo at the vertical middle of the current view.

     The new entry is created for ``self.current_height + IMG_HEIGHT / 2``
     and placed in front of the first existing entry whose ``y`` is larger,
     or at the end when there is none. The image is refreshed afterwards.
     ``event`` is not used.
     """
     target_y = self.current_height + (IMG_HEIGHT / 2)
     new_info = LineInfo(target_y)
     for position, existing in enumerate(self.line_infos):
         if existing.y > target_y:
             self.line_infos.insert(position, new_info)
             break
     else:
         # No entry lies below the new line, so it goes last.
         self.line_infos.append(new_info)
     self.set_image()