def test_page_has_word(self):
    """Exercise Page.find_word for matching lines and their LineInfo.

    Covers a single-line match, first-match precedence when two lines
    contain the word, a match on the second line, the LineInfo returned
    with and without ``has_header`` set, and the NoWordException raised
    for a word that is on no line.
    """
    page = Page('5')
    expected_line = Line('bob has a train', 1, StubSpellChecker(()))
    page.append(expected_line)

    # assert simple match; no line infos are attached yet
    line, line_info = page.find_word('has')
    self.assertEqual(expected_line, line)
    self.assertIsNone(line_info)

    next_line = Line('he has no friends', 1, StubSpellChecker(()))
    page.append(next_line)

    # assert picks first line
    line, line_info = page.find_word('has')
    self.assertEqual(expected_line, line)

    # assert can find in second line
    line, line_info = page.find_word('friends')
    self.assertEqual(next_line, line)

    line_info_1 = LineInfo(1)
    line_info_2 = LineInfo(2)
    line_info_3 = LineInfo(3)
    page.line_infos = [line_info_1, line_info_2, line_info_3]

    # assert finds correct line info no header
    line, line_info = page.find_word('friends')
    self.assertEqual(line_info_2, line_info)

    # With a header the expected info sits one slot further down the list.
    page.has_header = True

    # assert finds correct line info with header
    line, line_info = page.find_word('friends')
    self.assertEqual(line_info_3, line_info)

    # assert raises if not found
    self.assertRaises(NoWordException, page.find_word, 'notinstrings')
def OnAddLine(self, event):
    """Insert a new LineInfo at ``current_height + IMG_HEIGHT / 2``.

    The new entry is placed in front of the first existing line info whose
    ``y`` is greater than the new one, or at the end when there is none,
    and the image is then refreshed through ``set_image``.

    ``event`` is accepted but not used.
    """
    target_y = self.current_height + (IMG_HEIGHT / 2)
    new_info = LineInfo(target_y)

    # Index of the first entry lying past the new line; the list length
    # (i.e. "append") when every existing entry is at or before it.
    position = next(
        (pos for pos, existing in enumerate(self.line_infos)
         if existing.y > target_y),
        len(self.line_infos))
    self.line_infos.insert(position, new_info)

    self.set_image()
def set_page_nbr(self, page_nbr):
    """Switch to page ``page_nbr`` and hand its image and line infos to the frame.

    Opens ``images/pages/<page_nbr>.pbm`` converted to RGB, rebuilds
    ``self.frame.line_infos`` from ``working/page_info/<page_nbr>.csv`` and
    finally calls ``self.frame.show_panel()``.

    CSV columns read per row: 1 -> height, 2 -> left_margin,
    3 -> the LineInfo constructor argument, 4 -> width.  Column 0 is not
    used here.

    Raises whatever ``Image.open`` / ``open`` raise when a file is missing,
    and ValueError / IndexError for a malformed, non-empty row.
    """
    self.page_nbr = page_nbr
    self.frame.image = Image.open(
        'images/pages/{}.pbm'.format(page_nbr)).convert('RGB')
    self.frame.page_nbr = page_nbr
    self.frame.line_infos = []

    # Binary mode: the Python 2 csv module expects it, and it matches how
    # the same page_info files are opened by load().
    with open('working/page_info/{}.csv'.format(page_nbr), 'rb') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); there is nothing to parse on those.
                continue
            line_info = LineInfo(int(row[3]))
            line_info.height = int(row[1])
            line_info.left_margin = int(row[2])
            line_info.width = int(row[4])
            self.frame.line_infos.append(line_info)

    self.frame.show_panel()
def load(self, raw_file_dir):
    """Creates lines out of all the lines in a directory.

    Steps:

    1. If ``working/page_info`` exists, every ``*.csv`` in it is parsed
       into LineInfo objects grouped by the value of column 0 (columns:
       1 -> height, 2 -> left_margin, 3 -> LineInfo constructor argument,
       4 -> width).
    2. If ``working/headers.txt`` exists, the first ``|``-separated field
       of each line is collected; a page whose file name is in that set is
       created with its header flag set.
    3. Every ``<number>.txt`` file in ``raw_file_dir`` with
       ``start_page <= number`` (and ``number <= end_page`` when
       ``end_page > 0``) becomes a Page stored in ``self.pages`` under its
       base name, holding one Line per text line (numbered from 1).

    Afterwards ``self.average_length``, ``self.average_lines_per_page`` and
    ``self.page_numbers`` are refreshed.

    Files in ``raw_file_dir`` that are not ``.txt`` or whose base name is
    not an integer are ignored.
    """
    # --- per-page line geometry -------------------------------------
    line_infos = defaultdict(list)
    page_info_dir = 'working/page_info'
    if os.path.exists(page_info_dir):
        for filename in os.listdir(page_info_dir):
            if not filename.endswith('.csv'):
                continue
            print('parsing {}'.format(filename))
            with open('{}/{}'.format(page_info_dir, filename), 'rb') as f:
                for row in csv.reader(f):
                    if not row:
                        # Blank line in the CSV: nothing to parse.
                        continue
                    line_info = LineInfo(int(row[3]))
                    line_info.height = int(row[1])
                    line_info.left_margin = int(row[2])
                    line_info.width = int(row[4])
                    line_infos[row[0]].append(line_info)

    # --- pages flagged as having a header ---------------------------
    headers = set()
    if os.path.exists('working/headers.txt'):
        with open('working/headers.txt', 'rb') as f:
            for l in f:
                # Drop the line ending first so that a line without any
                # '|' still yields a clean name that can match a file.
                headers.add(l.rstrip('\r\n').split('|')[0])

    # --- candidate page files, in numeric order ---------------------
    page_files = []
    for fn in os.listdir(raw_file_dir):
        basename, ext = os.path.splitext(fn)
        if ext != '.txt':
            continue
        try:
            number = int(basename)
        except ValueError:
            # Stray file with a non-numeric name: skip it rather than
            # abort the whole load.
            continue
        page_files.append((number, basename, fn))
    # Sort on the number only so ties keep their directory order.
    page_files.sort(key=lambda item: item[0])

    for number, basename, fn in page_files:
        if number < self.start_page:
            continue
        if self.end_page > 0 and number > self.end_page:
            # Sorted input: every remaining page is past the end too.
            break
        with codecs.open('{}/{}'.format(raw_file_dir, fn),
                         mode='r', encoding='utf-8') as f:
            if self.verbose:
                print('Loading page {:>3}'.format(basename))
            page = Page(basename, fn in headers, line_infos[basename])
            self.pages[basename] = page
            for idx, l in enumerate(f, 1):
                page.append(Line(l.strip(), idx, self.spell_checker))

    # --- derived statistics -----------------------------------------
    self.average_length = self.calculate_average_length()
    page_count = len(self.pages)
    total_lines = sum(len(page) for page in self.pages.values())
    # Guard against an empty load; with no pages the average is 0.
    self.average_lines_per_page = (
        total_lines / page_count if page_count else 0)
    self.page_numbers = sorted(self.pages, key=int)