Esempio n. 1
0
    def set_page_nbr(self, page_nbr):
        """Point the frame at page ``page_nbr`` and show it.

        Loads ``images/pages/<page_nbr>.pbm`` (converted to RGB) and the
        per-line records in ``working/page_info/<page_nbr>.csv``, stores both
        on ``self.frame`` and finally calls ``self.frame.show_panel()``.

        CSV rows are read by position: column 3 is handed to the ``LineInfo``
        constructor, columns 1, 2 and 4 become ``height``, ``left_margin`` and
        ``width``. Column 0 is not used here.
        """
        self.page_nbr = page_nbr

        image_path = 'images/pages/{}.pbm'.format(page_nbr)
        self.frame.image = Image.open(image_path).convert('RGB')
        self.frame.page_nbr = page_nbr

        # The list is attached to the frame first and filled in place while
        # the CSV is being parsed.
        self.frame.line_infos = []
        info_path = 'working/page_info/{}.csv'.format(page_nbr)
        with open(info_path, 'r') as info_file:
            for record in csv.reader(info_file):
                entry = LineInfo(int(record[3]))
                entry.height = int(record[1])
                entry.left_margin = int(record[2])
                entry.width = int(record[4])
                self.frame.line_infos.append(entry)

        self.frame.show_panel()
Esempio n. 2
0
    def load(self, raw_file_dir):
        """Load every numbered ``.txt`` page found in ``raw_file_dir``.

        Page files are expected to be named ``<page number>.txt``. They are
        visited in numeric order; pages below ``self.start_page`` are skipped
        and, when ``self.end_page`` is greater than zero, loading stops once a
        page number exceeds it. Directory entries that are not ``.txt`` files
        or whose stem is not an integer are ignored.

        For each page a ``Page`` is stored in ``self.pages`` under the file
        stem (a string) and filled with one ``Line`` per line of text,
        numbered from 1. Two optional side files are read first:

        * ``working/page_info/*.csv`` -- per-line records. Column 0 is the
          page key, column 3 is passed to the ``LineInfo`` constructor and
          columns 1, 2 and 4 become ``height``, ``left_margin`` and ``width``.
        * ``working/headers.txt`` -- the text before the first ``|`` of every
          line is collected; whether the page's file name is in that set is
          passed as the second ``Page`` argument.

        Afterwards ``self.average_length``, ``self.average_lines_per_page``
        and ``self.page_numbers`` are recomputed.

        Args:
            raw_file_dir: Directory holding the UTF-8 encoded page files.
        """
        page_info_dir = 'working/page_info'
        line_infos = defaultdict(list)
        if os.path.exists(page_info_dir):
            for filename in os.listdir(page_info_dir):
                if not filename.endswith('.csv'):
                    continue
                print('parsing {}'.format(filename))
                # Binary mode is what the Python 2 csv module expects.
                with open(os.path.join(page_info_dir, filename), 'rb') as f:
                    for row in csv.reader(f):
                        line_info = LineInfo(int(row[3]))
                        line_info.height = int(row[1])
                        line_info.left_margin = int(row[2])
                        line_info.width = int(row[4])
                        line_infos[row[0]].append(line_info)

        headers = set()
        if os.path.exists('working/headers.txt'):
            with open('working/headers.txt', 'rb') as f:
                # NOTE(review): a line without '|' keeps its trailing newline
                # and therefore will not equal a plain file name -- confirm
                # that every line of headers.txt contains a '|'.
                headers.update(entry.split('|')[0] for entry in f)

        # Keep only "<integer>.txt" entries *before* sorting, so an unrelated
        # file in the directory (e.g. ".DS_Store" or "notes.txt") cannot make
        # int() raise and abort the whole load.
        numbered = []
        for fn in os.listdir(raw_file_dir):
            basename, ext = os.path.splitext(fn)
            if ext != '.txt':
                continue
            try:
                page_nbr = int(basename)
            except ValueError:
                continue
            numbered.append((page_nbr, basename, fn))
        # Sort on the page number only; equal numbers keep directory order.
        numbered.sort(key=lambda item: item[0])

        for page_nbr, basename, fn in numbered:
            if page_nbr < self.start_page:
                continue
            # The list is sorted, so nothing after this point can be in range.
            if self.end_page > 0 and page_nbr > self.end_page:
                break
            path = os.path.join(raw_file_dir, fn)
            with codecs.open(path, mode='r', encoding='utf-8') as f:
                if self.verbose:
                    print('Loading page {:>3}'.format(basename))
                page = Page(basename, fn in headers, line_infos[basename])
                self.pages[basename] = page
                for idx, text in enumerate(f, 1):
                    page.append(Line(text.strip(), idx, self.spell_checker))

        self.average_length = self.calculate_average_length()
        page_count = len(self.pages)
        if page_count:
            total_lines = sum(len(page) for page in self.pages.values())
            # Floor division keeps the integer result that "/" gives for two
            # ints on Python 2.
            self.average_lines_per_page = total_lines // page_count
        else:
            # Nothing was loaded; avoid dividing by zero.
            self.average_lines_per_page = 0
        self.page_numbers = sorted(self.pages.keys(), key=int)