def extract_text(oyster_doc, data):
    """Extract text from an HTML payload.

    When the document contains at least one ``<pre>`` element, the text of
    the first one is ASCII-encoded (unencodable characters are replaced)
    and handed to ``text_after_line_numbers``.  Otherwise the text of every
    second ``<td>`` inside a ``<tr>`` is joined with newlines.

    Args:
        oyster_doc: Not read by this implementation.
        data: HTML markup parsed with ``lxml.html.fromstring``.

    Returns:
        Whatever ``text_after_line_numbers`` returns for the ``<pre>`` case,
        or the newline-joined cell text for the table case.
    """
    tree = lxml.html.fromstring(data)
    pre_elements = tree.xpath('//pre')

    # No <pre> block: fall back to the second column of each table row.
    if not pre_elements:
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join([cell.text_content() for cell in cells])

    raw = pre_elements[0].text_content()
    encoded = raw.encode('ascii', 'replace')
    return text_after_line_numbers(encoded)
def extract_text(oyster_doc, data):
    """Extract text from a document, handling PDF payloads only.

    Args:
        oyster_doc: Mapping describing the document; only
            ``oyster_doc["metadata"]["mimetype"]`` is read.
        data: Raw document payload, passed to ``pdfdata_to_text``.

    Returns:
        The result of ``text_after_line_numbers`` applied to the converted
        PDF text when the mimetype is exactly ``"application/pdf"``;
        ``None`` for every other mimetype.

    Raises:
        A lookup error (``KeyError`` for a plain dict) if ``"metadata"`` or
        ``"mimetype"`` is missing.
    """
    if oyster_doc["metadata"]["mimetype"] != "application/pdf":
        # Unsupported type: say so explicitly rather than falling off the
        # end of the function.
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(oyster_doc, data):
    """Convert a PDF payload to text and post-process it.

    Args:
        oyster_doc: Not read by this implementation.
        data: Raw payload passed to ``pdfdata_to_text``.

    Returns:
        The value returned by ``text_after_line_numbers`` for the
        converted text.
    """
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(oyster_doc, data):
    """Run the PDF-to-text conversion, then ``text_after_line_numbers``.

    Args:
        oyster_doc: Not read by this implementation.
        data: Raw payload passed to ``pdfdata_to_text``.

    Returns:
        The value returned by ``text_after_line_numbers``.
    """
    converted = pdfdata_to_text(data)
    result = text_after_line_numbers(converted)
    return result
def extract_text(oyster_doc, data):
    """Return processed text for PDF documents, ``None`` for anything else.

    Args:
        oyster_doc: Mapping describing the document; only
            ``oyster_doc['metadata']['mimetype']`` is read.
        data: Raw document payload, passed to ``pdfdata_to_text`` when the
            document is a PDF.

    Returns:
        ``text_after_line_numbers(pdfdata_to_text(data))`` when the
        mimetype equals ``'application/pdf'``; otherwise ``None``.

    Raises:
        A lookup error (``KeyError`` for a plain dict) if ``'metadata'`` or
        ``'mimetype'`` is missing.
    """
    mimetype = oyster_doc['metadata']['mimetype']
    if mimetype == 'application/pdf':
        return text_after_line_numbers(pdfdata_to_text(data))
    # Every path returns explicitly; non-PDF input yields None.
    return None
def extract_text(oyster_doc, data):
    """Extract text from the first ``<pre>`` element of an HTML payload.

    Args:
        oyster_doc: Not read by this implementation.
        data: HTML markup parsed with ``lxml.html.fromstring``.

    Returns:
        The value returned by ``text_after_line_numbers`` for the text
        content of the first ``<pre>`` element.

    Raises:
        IndexError: If the document contains no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(data)
    first_pre = tree.xpath('//pre')[0]
    return text_after_line_numbers(first_pre.text_content())
def extract_text(oyster_doc, data):
    """Convert a PDF payload to text and ASCII-encode the processed result.

    Args:
        oyster_doc: Not read by this implementation.
        data: Raw payload passed to ``pdfdata_to_text``.

    Returns:
        The output of ``text_after_line_numbers`` encoded as ASCII, with
        characters that cannot be encoded dropped.
    """
    processed = text_after_line_numbers(pdfdata_to_text(data))
    # 'ignore' silently discards anything outside the ASCII range.
    return processed.encode('ascii', 'ignore')
def extract_text(oyster_doc, data):
    """Return processed text for documents identified as PDFs.

    A document counts as a PDF when its recorded mimetype is
    ``'application/pdf'`` or, failing that, when its URL ends with
    ``'.pdf'`` (case-sensitive).

    Args:
        oyster_doc: Mapping describing the document.  Reads
            ``oyster_doc['metadata']['mimetype']`` and, only when the
            mimetype does not match, ``oyster_doc['url']``.
        data: Raw document payload, passed to ``pdfdata_to_text`` when the
            document is a PDF.

    Returns:
        ``text_after_line_numbers(pdfdata_to_text(data))`` for PDFs;
        ``None`` for everything else.

    Raises:
        A lookup error (``KeyError`` for a plain dict) if a key that is
        consulted is missing.
    """
    # Mimetype is checked first; the URL is only consulted when it does
    # not already identify the document as a PDF.
    is_pdf = (
        oyster_doc['metadata']['mimetype'] == 'application/pdf'
        or oyster_doc['url'].endswith('.pdf')
    )
    if not is_pdf:
        # Explicit result for unsupported documents instead of an implicit
        # fall-through.
        return None
    return text_after_line_numbers(pdfdata_to_text(data))