def extract_text(doc, data):
    """Extract the text of a document from its raw data.

    When ``doc['mimetype']`` is ``'text/html'`` the data is parsed with
    lxml and the text content of the first ``<div class="Section2">`` is
    returned. Any other mimetype is handled as PDF data: it is converted
    with ``pdfdata_to_text`` and passed through ``text_after_line_numbers``.

    Raises ``IndexError`` if an HTML document has no matching div, and
    ``KeyError`` if ``doc`` has no ``'mimetype'`` entry.
    """
    if doc['mimetype'] != 'text/html':
        return text_after_line_numbers(pdfdata_to_text(data))

    tree = lxml.html.fromstring(data)
    section = tree.xpath('//div[@class="Section2"]')[0]
    return section.text_content()
def extract_text(oyster_doc, data):
    """Extract text from an HTML document.

    If the page contains a ``<pre>`` element, the text of the first one is
    encoded to ASCII (unencodable characters replaced) and passed through
    ``text_after_line_numbers``. Otherwise the text of the second ``<td>``
    of every table row is joined with newlines.

    ``oyster_doc`` is accepted but not used.
    """
    tree = lxml.html.fromstring(data)
    pre_blocks = tree.xpath('//pre')

    if not pre_blocks:
        # No <pre>: fall back to the second cell of each table row.
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join(cell.text_content() for cell in cells)

    ascii_text = pre_blocks[0].text_content().encode('ascii', 'replace')
    return text_after_line_numbers(ascii_text)
def extract_text(doc, data):
    """Extract text from an HTML document.

    Prefers the first ``<pre>`` element: its text is ASCII-encoded with
    the ``"replace"`` error handler and handed to
    ``text_after_line_numbers``. When there is no ``<pre>``, returns the
    newline-joined text of the second ``<td>`` in each table row.

    The incoming ``doc`` argument is not read.
    """
    root = lxml.html.fromstring(data)
    matches = root.xpath("//pre")
    return (
        text_after_line_numbers(
            matches[0].text_content().encode("ascii", "replace")
        )
        if matches
        else "\n".join(td.text_content() for td in root.xpath("//tr/td[2]"))
    )
def extract_text(doc, data):
    """Extract text from an HTML document.

    Uses the first ``<pre>`` element when one exists (ASCII-encoded with
    replacement, then passed to ``text_after_line_numbers``); otherwise
    collects the text of the second cell of every table row, one per line.

    The incoming ``doc`` argument is not read.
    """
    parsed = lxml.html.fromstring(data)
    pre_elements = parsed.xpath('//pre')

    if pre_elements:
        first_pre = pre_elements[0]
        encoded = first_pre.text_content().encode('ascii', 'replace')
        return text_after_line_numbers(encoded)

    lines = [cell.text_content() for cell in parsed.xpath('//tr/td[2]')]
    return '\n'.join(lines)
def extract_text(doc, data):
    """Extract text from PDF data; return ``None`` for other mimetypes.

    Only documents whose ``doc['mimetype']`` equals ``'application/pdf'``
    are processed: the data is converted with ``pdfdata_to_text`` and the
    result passed through ``text_after_line_numbers``.
    """
    if doc['mimetype'] != 'application/pdf':
        # Non-PDF documents are not handled.
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(doc, data):
    """Convert PDF data to text and pass it through ``text_after_line_numbers``.

    ``doc`` is accepted but not used.
    """
    raw_text = pdfdata_to_text(data)
    return text_after_line_numbers(raw_text)
def extract_text(doc, data):
    """Return ``text_after_line_numbers`` applied to the PDF's extracted text.

    ``doc`` is accepted but not used.
    """
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(oyster_doc, data):
    """Extract text from PDF data; return ``None`` for other mimetypes.

    The mimetype is read from ``oyster_doc['metadata']['mimetype']``. For
    ``'application/pdf'`` the data is converted with ``pdfdata_to_text``
    and passed through ``text_after_line_numbers``.
    """
    mimetype = oyster_doc['metadata']['mimetype']
    if mimetype != 'application/pdf':
        return None
    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
def extract_text(doc, data):
    """Extract text from PDF data; return ``None`` for anything else.

    A document counts as a PDF when ``doc['mimetype']`` is
    ``'application/pdf'`` or ``doc['url']`` ends with ``'.pdf'``. The URL
    is only consulted when the mimetype does not already match.
    """
    if doc['mimetype'] != 'application/pdf' and not doc['url'].endswith('.pdf'):
        # Neither the mimetype nor the URL suffix indicates a PDF.
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(doc, data):
    """Convert PDF data to text and return it ASCII-encoded.

    The PDF text is passed through ``text_after_line_numbers`` and the
    result is encoded to ASCII, silently dropping characters that cannot
    be encoded (``'ignore'`` error handler).

    ``doc`` is accepted but not used.
    """
    pdf_text = pdfdata_to_text(data)
    body = text_after_line_numbers(pdf_text)
    return body.encode('ascii', 'ignore')
def extract_text(oyster_doc, data):
    """Return the first ``<pre>`` element's text via ``text_after_line_numbers``.

    The data is parsed as HTML with lxml. Raises ``IndexError`` when the
    page has no ``<pre>`` element.

    ``oyster_doc`` is accepted but not used.
    """
    tree = lxml.html.fromstring(data)
    first_pre = tree.xpath('//pre')[0]
    return text_after_line_numbers(first_pre.text_content())
def extract_text(doc, data):
    """Extract text from PDF data; return ``None`` for anything else.

    The document is treated as a PDF if ``doc["mimetype"]`` equals
    ``"application/pdf"`` or, failing that, if ``doc["url"]`` ends with
    ``".pdf"``.
    """
    looks_like_pdf = (
        doc["mimetype"] == "application/pdf" or doc["url"].endswith(".pdf")
    )
    return (
        text_after_line_numbers(pdfdata_to_text(data)) if looks_like_pdf else None
    )
def extract_text(oyster_doc, data):
    """Convert PDF data to text and pass it through ``text_after_line_numbers``.

    ``oyster_doc`` is accepted but not used.
    """
    converted = pdfdata_to_text(data)
    return text_after_line_numbers(converted)
def extract_text(doc, data):
    """Return the first ``<pre>`` element's text with line numbers stripped twice.

    The data is parsed as HTML with lxml and ``text_after_line_numbers``
    is applied two times in succession to the ``<pre>`` text. Raises
    ``IndexError`` when the page has no ``<pre>`` element.

    The incoming ``doc`` argument is not read.
    """
    tree = lxml.html.fromstring(data)
    pre_text = tree.xpath('//pre')[0].text_content()
    # The text carries two sets of line numbers, so strip once per set.
    first_pass = text_after_line_numbers(pre_text)
    return text_after_line_numbers(first_pass)
def extract_text(doc, data):
    """Extract text from PDF data; return ``None`` for other mimetypes.

    When ``doc["mimetype"]`` is ``"application/pdf"`` the data is
    converted with ``pdfdata_to_text`` and the result passed through
    ``text_after_line_numbers``.
    """
    is_pdf = doc["mimetype"] == "application/pdf"
    if is_pdf:
        pdf_text = pdfdata_to_text(data)
        return text_after_line_numbers(pdf_text)
    else:
        return None