def split(self, paragraph, c):
    """Split *paragraph* into PARAGRAPH elements at changes of left indent.

    Each child line whose ``left`` value does not loosely match the
    estimated indent (tolerance ``c['para_indent_diff']``) starts a new
    PARAGRAPH, except for the very first line, which may be indented.
    Every paragraph closed this way is marked ``complete="yes"``.

    The last paragraph is marked ``complete="no"`` only when its final
    line's ``right`` value loosely matches the estimated justification
    edge and the paragraph holds more than one line; otherwise it is
    marked ``complete="yes"``.  With no lines at all, a single empty,
    unmarked PARAGRAPH is returned.

    NOTE(review): if ``etree`` is lxml, appending a line here moves it out
    of the input paragraph -- confirm callers expect that.

    Returns a list of PARAGRAPH elements.
    """
    children = paragraph.getchildren()
    indent = self.estimateIndent(paragraph, c)
    justify = self.estimateJustify(paragraph, c)

    output = []
    current = etree.Element('PARAGRAPH')
    info = None

    for position, line in enumerate(children):
        info = common.lineExtract(line)
        aligned = common.looseCompare(info['left'], indent,
                                      c['para_indent_diff'])
        if not aligned and position != 0:
            # Indent changed after the first line: close the running
            # paragraph and open a fresh one for this line.
            current.set('complete', 'yes')
            output.append(current)
            current = etree.Element('PARAGRAPH')
        current.append(line)

    if info is not None:
        # `info` still describes the last line seen.
        reaches_edge = common.looseCompare(info['right'], justify,
                                           c['para_indent_diff'])
        if reaches_edge and len(current) != 1:
            current.set('complete', 'no')
        else:
            current.set('complete', 'yes')

    output.append(current)
    return output
def split(self, paragraph, c):
    """Break *paragraph* into PARAGRAPH elements wherever the indent shifts.

    A line starts a new PARAGRAPH when its ``left`` value fails the loose
    comparison against the estimated indent (tolerance
    ``c['para_indent_diff']``) and it is not the first line.  Paragraphs
    closed that way get ``complete="yes"``.

    The trailing paragraph gets ``complete="no"`` only if its last line's
    ``right`` value loosely matches the estimated justification edge and
    the paragraph contains more than one line; in every other case it gets
    ``complete="yes"``.  An input without lines produces one empty
    PARAGRAPH carrying no ``complete`` attribute.

    NOTE(review): with lxml, ``append`` relocates each line out of the
    input paragraph -- verify that this is intended by callers.

    Returns a list of PARAGRAPH elements.
    """
    source_lines = paragraph.getchildren()
    indent = self.estimateIndent(paragraph, c)
    justify = self.estimateJustify(paragraph, c)

    paragraphs = []
    block = etree.Element('PARAGRAPH')
    info = None

    for index, line in enumerate(source_lines):
        info = common.lineExtract(line)
        same_indent = common.looseCompare(info['left'], indent,
                                          c['para_indent_diff'])
        starts_new = (not same_indent) and index != 0
        if starts_new:
            # Finish the paragraph collected so far.
            block.set('complete', 'yes')
            paragraphs.append(block)
            block = etree.Element('PARAGRAPH')
        block.append(line)

    if info is not None:
        # Decide completeness from the last line processed.
        if common.looseCompare(info['right'], justify,
                               c['para_indent_diff']):
            status = 'yes' if len(block) == 1 else 'no'
        else:
            status = 'yes'
        block.set('complete', status)

    paragraphs.append(block)
    return paragraphs
def apply(self, page, c):
    """Regroup the TOKEN elements of *page* into LINE elements.

    Tokens are visited in the order produced by iterating TEXT elements
    and then their TOKEN descendants.  A token joins the current LINE
    when its ``y`` value loosely matches the ``y`` of the token that
    opened that line (tolerance ``c['y_height_diff']``); otherwise the
    current LINE is emitted and a new one is started with this token.

    After grouping, ``self.lineSummary`` is called on every LINE in the
    result.  Returns the element created by
    ``common.copyElementAttributes(page)`` with the LINE elements appended.
    """
    output = common.copyElementAttributes(page)
    anchor = None  # extracted info of the token that opened the current line

    for text in page.iter('TEXT'):
        for token in text.iter('TOKEN'):
            metrics = common.tokenExtract(token)
            if anchor is None:
                # Very first token: open the first line.
                anchor = metrics
                current = etree.Element('LINE')
            if not common.looseCompare(metrics['y'], anchor['y'],
                                       c['y_height_diff']):
                # Vertical position changed: flush and start a new line.
                output.append(current)
                current = etree.Element('LINE')
                anchor = metrics
            current.append(token)

    # Flush the line still being built, if any token was seen.
    if anchor is not None:
        output.append(current)

    for line in output.iter('LINE'):
        self.lineSummary(line)
    return output
def split(self, paragraph, c):
    """Split *paragraph* into PARAGRAPH elements at unusual line spacing.

    For each pair of consecutive lines the difference of their ``base``
    values is loosely compared (tolerance ``c['vertical_diff']``) with the
    value returned by ``self.lineDiff`` for all lines.  A mismatch closes
    the current PARAGRAPH and starts a new one with the lower line.

    Returns a list of PARAGRAPH elements; with no lines the list holds a
    single empty PARAGRAPH.
    """
    output = []
    lines = paragraph.getchildren()
    common_diff = self.lineDiff(lines)

    current = etree.Element('PARAGRAPH')
    if lines:
        current.append(lines[0])

    # Walk consecutive (upper, lower) pairs of the snapshot list.
    for upper, lower in zip(lines, lines[1:]):
        upper_info = common.lineExtract(upper)
        lower_info = common.lineExtract(lower)
        gap = lower_info['base'] - upper_info['base']
        if not common.looseCompare(common_diff, gap, c['vertical_diff']):
            # Spacing differs from the usual one: cut here.
            output.append(current)
            current = etree.Element('PARAGRAPH')
        current.append(lower)

    output.append(current)
    return output
def apply(self, page, c):
    """Group the child lines of *page* into PARAGRAPH elements by height.

    Consecutive lines stay in the same PARAGRAPH while their ``height``
    values loosely match (tolerance ``c['line_height_diff']``); a mismatch
    closes the current PARAGRAPH and starts a new one with the next line.

    Returns the element created by ``common.copyElementAttributes(page)``
    with the PARAGRAPH elements appended.  A page without lines still
    yields one empty PARAGRAPH, as before.
    """
    output = common.copyElementAttributes(page)
    paragraph = etree.Element('PARAGRAPH')
    lines = page.getchildren()

    # Seed with the first line whenever one exists.  Requiring more than
    # one line here would drop the only line of a single-line page.
    if lines:
        paragraph.append(lines[0])

    for current, nxt in zip(lines, lines[1:]):
        c_info = common.lineExtract(current)
        n_info = common.lineExtract(nxt)
        if not common.looseCompare(c_info['height'], n_info['height'],
                                   c['line_height_diff']):
            # Height changed: close this paragraph and open a new one.
            output.append(paragraph)
            paragraph = etree.Element('PARAGRAPH')
        paragraph.append(nxt)

    # Add the final (possibly only) paragraph.
    output.append(paragraph)
    return output
def apply(self, page, c):
    """Collect the TOKEN elements of *page* into LINE elements.

    Iterates every TOKEN found under each TEXT element.  A token is added
    to the running LINE if its ``y`` value loosely matches the ``y`` of
    the token that started that line (tolerance ``c['y_height_diff']``);
    otherwise the running LINE is appended to the result and the token
    starts a new LINE.

    ``self.lineSummary`` is then invoked on each LINE of the result.
    Returns the element obtained from
    ``common.copyElementAttributes(page)`` with the LINE elements appended.
    """
    result = common.copyElementAttributes(page)
    first_of_line = None  # extracted info of the token that began the line

    for text in page.iter('TEXT'):
        for token in text.iter('TOKEN'):
            token_info = common.tokenExtract(token)
            if first_of_line is None:
                # No line open yet: this token opens the first one.
                first_of_line = token_info
                running = etree.Element('LINE')
            on_same_line = common.looseCompare(token_info['y'],
                                               first_of_line['y'],
                                               c['y_height_diff'])
            if not on_same_line:
                # Emit the finished line and restart with this token.
                result.append(running)
                running = etree.Element('LINE')
                first_of_line = token_info
            running.append(token)

    # Emit the last line if at least one token was processed.
    if first_of_line is not None:
        result.append(running)

    for line in result.iter('LINE'):
        self.lineSummary(line)
    return result
def apply(self, page, c):
    """Group the child lines of *page* into PARAGRAPH elements by height.

    Neighbouring lines share a PARAGRAPH as long as their ``height``
    values loosely match (tolerance ``c['line_height_diff']``).  When they
    do not, the current PARAGRAPH is emitted and the next line opens a
    new one.

    Returns the element created by ``common.copyElementAttributes(page)``
    with the PARAGRAPH elements appended.  An empty page still produces a
    single empty PARAGRAPH, matching the previous behaviour.
    """
    output = common.copyElementAttributes(page)
    paragraph = etree.Element('PARAGRAPH')
    lines = page.getchildren()

    # The first line always belongs to the first paragraph.  The guard
    # must accept a lone line too, or a single-line page loses its line.
    if len(lines) > 0:
        paragraph.append(lines[0])

    for current, nxt in zip(lines, lines[1:]):
        c_info = common.lineExtract(current)
        n_info = common.lineExtract(nxt)
        same_height = common.looseCompare(c_info['height'],
                                          n_info['height'],
                                          c['line_height_diff'])
        if not same_height:
            # New paragraph starts at the next line.
            output.append(paragraph)
            paragraph = etree.Element('PARAGRAPH')
        paragraph.append(nxt)

    # Add the final paragraph.
    output.append(paragraph)
    return output
def split(self, paragraph, c):
    """Cut *paragraph* into PARAGRAPH elements where line spacing deviates.

    The difference between the ``base`` values of each pair of adjacent
    lines is loosely compared (tolerance ``c['vertical_diff']``) against
    the value ``self.lineDiff`` returns for the full list of lines.  When
    the comparison fails, the paragraph built so far is emitted and the
    lower line begins a new one.

    Returns a list of PARAGRAPH elements; an input without lines gives a
    list containing one empty PARAGRAPH.
    """
    pieces = []
    lines = paragraph.getchildren()
    common_diff = self.lineDiff(lines)

    piece = etree.Element('PARAGRAPH')
    if lines:
        piece.append(lines[0])

    for above, below in zip(lines, lines[1:]):
        above_info = common.lineExtract(above)
        below_info = common.lineExtract(below)
        step = below_info['base'] - above_info['base']
        regular = common.looseCompare(common_diff, step, c['vertical_diff'])
        if not regular:
            # Irregular spacing: finish this piece and begin another.
            pieces.append(piece)
            piece = etree.Element('PARAGRAPH')
        piece.append(below)

    pieces.append(piece)
    return pieces