def apply(self, page, c):
    """Regroup the TOKEN elements found under ``page`` into LINE elements.

    Every TOKEN nested in a TEXT element of ``page`` is visited in document
    order.  A token joins the running LINE while ``common.looseCompare``
    accepts its ``'y'`` value against the ``'y'`` of the token that opened
    that line, using ``c['y_height_diff']``; otherwise the running LINE is
    emitted and the token opens a new one.

    page -- element to scan for TEXT / TOKEN descendants.
    c    -- mapping providing the ``'y_height_diff'`` setting.

    Returns the element produced by ``common.copyElementAttributes(page)``
    with the LINE elements appended in order; ``self.lineSummary`` is called
    on each LINE before returning.  Nothing is appended when no token exists.
    """
    output = common.copyElementAttributes(page)
    line = None
    line_info = None  # info of the token that opened the running line
    for text in page.iter('TEXT'):
        for token in text.iter('TOKEN'):
            token_info = common.tokenExtract(token)
            if line_info is None:
                # very first token: it opens the first line
                line_info = token_info
                line = etree.Element('LINE')
            if not common.looseCompare(token_info['y'], line_info['y'], c['y_height_diff']):
                # new line: emit the running one, then reset the reference
                output.append(line)
                line = etree.Element('LINE')
                line_info = token_info
            # same line (or the freshly opened one)
            line.append(token)
    # handle last line
    if line_info is not None:
        output.append(line)
    for line in output.iter('LINE'):
        self.lineSummary(line)
    return output
def apply(self, page, c):
    """Group the direct children of ``page`` into PARAGRAPH elements.

    Neighbouring children stay in the same paragraph while
    ``common.looseCompare`` accepts their ``'height'`` values (as returned by
    ``common.lineExtract``) under ``c['line_height_diff']``; otherwise the
    running paragraph is emitted and a new one is started.

    page -- element whose direct children are the lines to group
            (presumably LINE elements -- confirm against the caller).
    c    -- mapping providing the ``'line_height_diff'`` setting.

    Returns the element produced by ``common.copyElementAttributes(page)``
    with the PARAGRAPH elements appended in order.  A page without children
    still yields a single empty PARAGRAPH, exactly as before.
    """
    output = common.copyElementAttributes(page)
    paragraph = etree.Element('PARAGRAPH')
    # Snapshot the children up front: appending a line to a paragraph may
    # detach it from ``page`` (lxml moves appended elements), so the live
    # element must not be indexed while regrouping.
    lines = list(page)
    if lines:
        # The first line always opens the first paragraph.  The former
        # ``len(lines) > 1`` guard silently dropped the only line of a
        # one-line page.
        paragraph.append(lines[0])
    for current, nxt in zip(lines, lines[1:]):
        c_info = common.lineExtract(current)
        n_info = common.lineExtract(nxt)
        if not common.looseCompare(c_info['height'], n_info['height'], c['line_height_diff']):
            # new para: emit the running one and start afresh
            output.append(paragraph)
            paragraph = etree.Element('PARAGRAPH')
        # same para (or the freshly opened one)
        paragraph.append(nxt)
    # add final paragraph
    output.append(paragraph)
    return output
def apply(self, page, c):
    """Collect the TOKEN elements of ``page`` into LINE elements.

    Tokens are read from each TEXT element of ``page`` in document order.
    The ``'y'`` entry of each token's ``common.tokenExtract`` result is
    checked with ``common.looseCompare`` against the ``'y'`` of the token
    that started the current LINE, using ``c['y_height_diff']``.  A rejected
    token closes the current LINE and starts the next one.

    page -- element containing TEXT elements with TOKEN descendants.
    c    -- mapping that holds the ``'y_height_diff'`` setting.

    Returns the result of ``common.copyElementAttributes(page)`` with the
    LINE elements appended; each LINE is passed to ``self.lineSummary``
    before the element is returned.  With no tokens, no LINE is appended.
    """
    output = common.copyElementAttributes(page)
    current_line = None
    line_info = None  # extract of the token that started current_line
    for text in page.iter('TEXT'):
        for token in text.iter('TOKEN'):
            token_info = common.tokenExtract(token)
            if line_info is None:
                # first token seen: start the first line with it
                line_info = token_info
                current_line = etree.Element('LINE')
            same_line = common.looseCompare(
                token_info['y'], line_info['y'], c['y_height_diff'])
            if not same_line:
                # new line: flush the finished one and reset the reference
                output.append(current_line)
                current_line = etree.Element('LINE')
                line_info = token_info
            current_line.append(token)
    # handle last line
    if line_info is not None:
        output.append(current_line)
    for line in output.iter('LINE'):
        self.lineSummary(line)
    return output
def apply(self, page, c):
    """Bundle the direct children of ``page`` into PARAGRAPH elements.

    Each child is compared with its successor: the ``'height'`` entries of
    their ``common.lineExtract`` results go through ``common.looseCompare``
    with ``c['line_height_diff']``.  An accepted successor joins the current
    paragraph; a rejected one closes it and opens a new paragraph.

    page -- element whose direct children are grouped (presumably LINE
            elements -- confirm against the caller).
    c    -- mapping that holds the ``'line_height_diff'`` setting.

    Returns the result of ``common.copyElementAttributes(page)`` with the
    PARAGRAPH elements appended in order.  As before, a page with no
    children produces one empty PARAGRAPH.
    """
    output = common.copyElementAttributes(page)
    paragraph = etree.Element('PARAGRAPH')
    # Work on a list copy of the children; appending a child to a paragraph
    # may remove it from ``page`` (lxml re-parents appended elements).
    lines = list(page)
    if len(lines) > 0:
        # Seed the first paragraph with the first line.  Testing for more
        # than one line here used to lose the sole line of a one-line page.
        paragraph.append(lines[0])
    for current, nxt in zip(lines, lines[1:]):
        c_info = common.lineExtract(current)
        n_info = common.lineExtract(nxt)
        if common.looseCompare(c_info['height'], n_info['height'], c['line_height_diff']):
            # same para
            paragraph.append(nxt)
            continue
        # new para
        output.append(paragraph)
        paragraph = etree.Element('PARAGRAPH')
        paragraph.append(nxt)
    # add final paragraph
    output.append(paragraph)
    return output
def apply(self, page, c):
    """Rebuild ``page`` from the pieces ``self.split`` yields per paragraph.

    Every PARAGRAPH found by ``page.iter('PARAGRAPH')`` is handed to
    ``self.split(paragraph, c)``, and each item of the result is appended,
    in order, to the element created by ``common.copyElementAttributes(page)``.
    That element is returned.
    """
    result = common.copyElementAttributes(page)
    add_piece = result.append
    for source_paragraph in page.iter('PARAGRAPH'):
        for piece in self.split(source_paragraph, c):
            add_piece(piece)
    return result
def apply(self, page, c):
    """Return a copy-shell of ``page`` filled with split paragraphs.

    The shell comes from ``common.copyElementAttributes(page)``.  Each
    PARAGRAPH of ``page`` is passed to ``self.split`` together with ``c``,
    and every element of the result is appended to the shell in order.
    """
    def split_pieces():
        # Lazily walk the paragraphs so each split runs only once the
        # pieces of the previous paragraph have been consumed.
        for paragraph in page.iter('PARAGRAPH'):
            for fragment in self.split(paragraph, c):
                yield fragment

    output = common.copyElementAttributes(page)
    for fragment in split_pieces():
        output.append(fragment)
    return output