def fixStartChars(self, paragraph, num):
    """Re-align the lines of a paragraph that opens with large start letters.

    The token at index ``num`` on the first line is treated as the first
    'non large' word: its ``x`` value is the left edge shared by the lines
    affected by the large start letter.  The first later line whose
    ``left`` differs from that value supplies the indent, which is then
    written to the first line and to every later line whose ``left`` equals
    that shared edge.

    paragraph -- element whose children are the lines of the paragraph
    num       -- index, within the first line, of the first normal token

    Returns True when an indent was found and applied, False when the
    paragraph has no lines, the first line has no token at ``num``, or no
    later line has a different left edge.
    """
    # list(element) is the supported replacement for getchildren()
    lines = list(paragraph)
    if not lines:
        # nothing to align (lines[0] used to raise IndexError here)
        return False

    tokens = list(lines[0])
    if num >= len(tokens):
        return False

    # the first 'non large' word has the same x value as the start
    # of the lines affected by the large start letter
    first_x = common.tokenExtract(tokens[num])['x']

    # extract each following line's left edge once; it is needed both to
    # find the indent and to decide which lines to rewrite
    rest = lines[1:]
    lefts = [common.lineExtract(line)['left'] for line in rest]

    # figure out indent by looking for the first line
    # with a different indent
    indent = None
    for left in lefts:
        if left != first_x:
            indent = left
            break
    if indent is None:
        return False

    # we found an indent: fix the first line and every affected line
    lines[0].set('left', unicode(indent))
    for line, left in zip(rest, lefts):
        if left == first_x:
            line.set('left', unicode(indent))
    return True
def apply(self, page, c):
    """Regroup the TOKEN elements of a page into LINE elements.

    Every TOKEN found below every TEXT element of ``page`` is visited in
    iteration order.  A token is appended to the current line while
    common.looseCompare() accepts its ``y`` value against the ``y`` of the
    token that opened the line (tolerance ``c['y_height_diff']``);
    otherwise the current line is closed and the token opens a new one.
    Once all lines are built, self.lineSummary() is called on each of them.

    page -- element containing TEXT elements, which contain TOKEN elements
    c    -- configuration mapping; only 'y_height_diff' is read here

    Returns the element created by common.copyElementAttributes(page) with
    the LINE elements appended to it.
    """
    output = common.copyElementAttributes(page)
    line = None
    line_info = None  # info of the token that opened the current line

    # Snapshot both iterators before touching the tree: appending a token
    # to a LINE may re-parent it (lxml moves elements on append), and the
    # result of iter() is undefined if the tree changes while it is walked.
    for text in list(page.iter('TEXT')):
        for token in list(text.iter('TOKEN')):
            token_info = common.tokenExtract(token)
            if line_info is None:
                # very first token: open the first line
                line_info = token_info
                line = etree.Element('LINE')
            if common.looseCompare(token_info['y'], line_info['y'],
                                   c['y_height_diff']):
                # same line
                line.append(token)
            else:
                # new line: close the current one and start again
                output.append(line)
                line = etree.Element('LINE')
                line.append(token)
                line_info = token_info

    # handle last line (only exists if at least one token was seen)
    if line_info is not None:
        output.append(line)

    for finished in output.iter('LINE'):
        self.lineSummary(finished)
    return output
def apply(self, page, c):
    """Build LINE elements out of the TOKEN elements found on a page.

    Tokens are taken from every TEXT element of ``page``, in iteration
    order.  The first token opens a line; each following token stays on
    that line as long as common.looseCompare() accepts its ``y`` value
    against the ``y`` of the line's opening token, using
    ``c['y_height_diff']`` as the tolerance.  A token that is rejected
    closes the line and opens the next one.  self.lineSummary() is then
    run on every LINE in the result.

    page -- element containing TEXT elements, which contain TOKEN elements
    c    -- configuration mapping; only 'y_height_diff' is read here

    Returns the element created by common.copyElementAttributes(page) with
    the LINE elements appended to it.
    """
    output = common.copyElementAttributes(page)
    line = None
    line_info = None  # info of the token that opened the current line

    # Materialise the iterators first: line.append(token) may move the
    # token out of its TEXT parent (lxml re-parents on append), and
    # iterating a tree that is being modified gives undefined results.
    texts = list(page.iter('TEXT'))
    for text in texts:
        tokens = list(text.iter('TOKEN'))
        for token in tokens:
            token_info = common.tokenExtract(token)
            if line_info is None:
                # very first token: open the first line
                line_info = token_info
                line = etree.Element('LINE')
            same_line = common.looseCompare(
                token_info['y'], line_info['y'], c['y_height_diff'])
            if same_line:
                line.append(token)
            else:
                # new line: store the finished one, then reset
                output.append(line)
                line = etree.Element('LINE')
                line.append(token)
                line_info = token_info

    # handle last line (only exists if at least one token was seen)
    if line_info is not None:
        output.append(line)

    for finished in output.iter('LINE'):
        self.lineSummary(finished)
    return output
def numberOfStartChars(self, paragraph):
    """Count the leading single-character tokens on a paragraph's first line.

    Walks the TOKEN elements of the first line in iteration order and
    counts them until the first token whose 'chars' value (as reported by
    common.tokenExtract) is not 1.

    paragraph -- element whose children are the lines of the paragraph

    Returns the count; always 0 when the paragraph has fewer than two
    lines.
    """
    # list(element) is the supported replacement for getchildren()
    lines = list(paragraph)
    if len(lines) <= 1:
        return 0

    count = 0
    for token in lines[0].iter('TOKEN'):
        if common.tokenExtract(token)['chars'] != 1:
            # first multi-character token ends the run
            break
        # single character
        count += 1
    return count
def apply(self, token, c):
    """Set the token's 'size' attribute from its font size.

    The 'font-size' value reported by common.tokenExtract() is compared
    with self.smallest and self.largest; values outside that range become
    'xx-small' / 'xx-large'.  Anything in between is looked up by bisecting
    self.sizes and mapping the selected entry through self.table.

    token -- element to label; its 'size' attribute is written
    c     -- not used by this method
    """
    sz = common.tokenExtract(token)['font-size']

    if sz < self.smallest:
        s = 'xx-small'
    elif sz > self.largest:
        s = 'xx-large'
    else:
        # bisect_right returns len(self.sizes) when sz is >= every entry,
        # which used to index one past the end; clamp to the last entry.
        # NOTE(review): assumes such a value belongs to the top bucket --
        # confirm against how sizes/largest are built.
        v = min(bisect.bisect_right(self.sizes, sz), len(self.sizes) - 1)
        s = self.table[self.sizes[v]]

    token.set('size', unicode(s))
def lineSummary(self, line):
    """Write summary attributes for a line, derived from its TOKEN elements.

    Sets, in this order, the attributes 'base', 'top', 'left', 'right',
    'height' and 'chars' on ``line``:

    * 'left' / 'right' are the smallest 'left' and largest 'right' seen
      (None when the line has no tokens);
    * 'chars' is the sum of the tokens' 'chars' values;
    * 'base', 'top' and 'height' use common.mostCommon() when there are
      more than two tokens, otherwise common.largest() for base and
      height and common.smallest() for top.

    line -- element whose TOKEN descendants are summarised
    """
    bases, tops, heights = [], [], []
    min_left = None
    max_right = None
    total_chars = 0

    for tok in line.iter('TOKEN'):
        data = common.tokenExtract(tok)
        bases.append(data['base'])
        tops.append(data['top'])
        heights.append(data['height'])

        tok_left = data['left']
        if min_left is None or tok_left < min_left:
            min_left = tok_left

        tok_right = data['right']
        if max_right is None or tok_right > max_right:
            max_right = tok_right

        total_chars += data['chars']

    # the three lists grow together, so one length check covers them all;
    # with at most two samples a "most common" value is not meaningful
    few_tokens = len(bases) <= 2
    pick_high = common.largest if few_tokens else common.mostCommon
    pick_low = common.smallest if few_tokens else common.mostCommon

    # apply summary (attribute order kept as before)
    line.set('base', unicode(pick_high(bases)))
    line.set('top', unicode(pick_low(tops)))
    line.set('left', unicode(min_left))
    line.set('right', unicode(max_right))
    line.set('height', unicode(pick_high(heights)))
    line.set('chars', unicode(total_chars))