def set_page_nbr(self, page_nbr):
    """Switch to page *page_nbr*, reload its image and line info, and show the panel.

    Stores *page_nbr* on both ``self`` and ``self.frame``, loads
    ``images/pages/<page_nbr>.pbm`` converted to RGB into ``self.frame.image``,
    rebuilds ``self.frame.line_infos`` from ``working/page_info/<page_nbr>.csv``
    and finally calls ``self.frame.show_panel()``.

    Each CSV row yields one ``LineInfo``: column 3 is passed to the
    constructor, column 1 becomes ``height``, column 2 ``left_margin`` and
    column 4 ``width`` (all converted with ``int``).

    If the page has no CSV file, ``self.frame.line_infos`` is left empty
    instead of raising.

    Raises:
        ValueError / IndexError: if a non-empty row is malformed.
    """
    self.page_nbr = page_nbr
    self.frame.image = Image.open('images/pages/{}.pbm'.format(page_nbr)).convert('RGB')
    self.frame.page_nbr = page_nbr
    self.frame.line_infos = []

    csv_path = 'working/page_info/{}.csv'.format(page_nbr)
    # Page info is optional: a page that has no CSV is shown without line data.
    if os.path.exists(csv_path):
        # The Python 2 csv module expects the file in binary mode, otherwise
        # newline translation can corrupt rows on some platforms.
        with open(csv_path, 'rb') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing newline).
                    continue
                line_info = LineInfo(int(row[3]))
                line_info.height = int(row[1])
                line_info.left_margin = int(row[2])
                line_info.width = int(row[4])
                self.frame.line_infos.append(line_info)

    self.frame.show_panel()
def load(self, raw_file_dir):
    """Load the numbered ``.txt`` pages in *raw_file_dir* into ``self.pages``.

    Steps:

    1. If ``working/page_info`` exists, every ``*.csv`` in it is parsed into
       ``LineInfo`` objects grouped by the value in column 0 of each row
       (column 3 goes to the constructor, columns 1, 2 and 4 become
       ``height``, ``left_margin`` and ``width``).
    2. If ``working/headers.txt`` exists, the text before the first ``|`` on
       each line is collected into a set.
    3. Files named ``<integer>.txt`` in *raw_file_dir* are visited in
       ascending numeric order. Pages below ``self.start_page`` are skipped,
       and loading stops after ``self.end_page`` when that is greater
       than 0. Each file becomes a ``Page`` stored under its base name,
       filled with one ``Line`` per text line (stripped, numbered from 1).
    4. ``self.average_length``, ``self.average_lines_per_page`` and
       ``self.page_numbers`` are recomputed.

    Files whose name is not ``<integer>.txt`` are ignored. When no pages are
    present, ``self.average_lines_per_page`` is set to 0.

    Args:
        raw_file_dir: directory containing the raw page text files.
    """
    # Layout records per page; pages without any CSV rows get an empty list.
    line_infos = defaultdict(list)
    page_info_dir = 'working/page_info'
    if os.path.exists(page_info_dir):
        for filename in os.listdir(page_info_dir):
            if not filename.endswith('.csv'):
                continue
            # Single-argument print(): same output as the Python 2 statement
            # form "print 'parsing', filename".
            print('parsing {}'.format(filename))
            with open(os.path.join(page_info_dir, filename), 'rb') as f:
                for row in csv.reader(f):
                    if not row:
                        # Blank lines come back as [] and carry no data.
                        continue
                    line_info = LineInfo(int(row[3]))
                    line_info.height = int(row[1])
                    line_info.left_margin = int(row[2])
                    line_info.width = int(row[4])
                    line_infos[row[0]].append(line_info)

    headers = set()
    if os.path.exists('working/headers.txt'):
        with open('working/headers.txt', 'rb') as f:
            for l in f:
                # Splitting a string cannot fail, so no exception guard is needed.
                headers.add(l.split('|', 1)[0])

    # Select the page files up front so that stray entries (README.txt,
    # .DS_Store, ...) are ignored instead of breaking the numeric sort.
    page_files = []
    for fn in os.listdir(raw_file_dir):
        basename, ext = os.path.splitext(fn)
        if ext != '.txt':
            continue
        try:
            page_nbr = int(basename)
        except ValueError:
            continue
        page_files.append((page_nbr, basename, fn))
    # Stable sort on the page number only, so ties keep directory order.
    page_files.sort(key=lambda item: item[0])

    for page_nbr, basename, fn in page_files:
        if page_nbr < self.start_page:
            continue
        if self.end_page > 0 and page_nbr > self.end_page:
            # Files are in ascending order, so nothing later can be in range.
            break
        with codecs.open(os.path.join(raw_file_dir, fn), mode='r', encoding='utf-8') as f:
            if self.verbose:
                print('Loading page {:>3}'.format(basename))
            page = Page(basename, fn in headers, line_infos[basename])
            self.pages[basename] = page
            for idx, l in enumerate(f, 1):
                page.append(Line(l.strip(), idx, self.spell_checker))

    self.average_length = self.calculate_average_length()
    page_count = len(self.pages)
    if page_count:
        total_lines = sum(len(lines) for lines in self.pages.values())
        self.average_lines_per_page = total_lines / page_count
    else:
        # Nothing loaded: avoid dividing by zero.
        self.average_lines_per_page = 0
    self.page_numbers = sorted(self.pages.keys(), key=int)