def az_text_extractor(mimetype, doc_source):
    """Extract text from a document, dispatching on its MIME type.

    For ``text/html`` the source is parsed with ``lxml.html`` and the text
    content of the first ``div`` whose ``class`` attribute is exactly
    ``Section2`` is returned.  Every other MIME type is passed through
    ``pdfdata_to_text`` and then ``text_after_line_numbers``.

    Raises ``IndexError`` when an HTML document has no matching ``div``.
    """
    if mimetype != 'text/html':
        # Anything that is not HTML goes through the PDF helpers.
        return text_after_line_numbers(pdfdata_to_text(doc_source))

    tree = lxml.html.fromstring(doc_source)
    sections = tree.xpath('//div[@class="Section2"]')
    # Only the first matching div is used; later matches are ignored.
    return sections[0].text_content()
def ar_text_extractor(doc_source):
    """Return the text of ``doc_source`` as produced by the PDF helpers.

    ``doc_source`` is handed to ``pdfdata_to_text`` and the result is then
    run through ``text_after_line_numbers``.
    """
    raw_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(raw_text)
def in_text_extractor(doc_source):
    """Convert ``doc_source`` with ``pdfdata_to_text`` and post-process it.

    The converted text is passed to ``text_after_line_numbers`` and that
    function's result is returned unchanged.
    """
    return text_after_line_numbers(pdfdata_to_text(doc_source))
def dc_text_extractor(doc_source):
    """Return only the lines of the document that are not deeply indented.

    The source is converted with ``pdfdata_to_text`` and split into lines.
    A line is kept when its first non-whitespace character is preceded by
    at most ten whitespace characters; blank lines and lines with a larger
    indent are dropped.  The kept lines are joined with ``'\\n'``.
    """
    # Raw string: in a plain literal '\s' and '\S' are invalid escape
    # sequences, which newer Python versions warn about.  The pattern
    # itself is unchanged.
    no_big_indent = re.compile(r'^\s{0,10}\S')
    lines = pdfdata_to_text(doc_source).splitlines()
    return '\n'.join(line for line in lines if no_big_indent.match(line))
def wi_text_extractor(mimetype, url, data):
    """Extract text when the document is identified as a PDF.

    A document counts as a PDF when ``mimetype`` equals
    ``'application/pdf'`` or ``url`` ends with ``'.pdf'``.  In that case
    ``data`` is run through ``pdfdata_to_text`` and
    ``text_after_line_numbers``.  Otherwise ``None`` is returned.
    """
    # The URL suffix is only consulted when the MIME type is not PDF.
    if mimetype != 'application/pdf' and not url.endswith('.pdf'):
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def wy_text_extractor(doc_source):
    """Return the lines containing a lowercase ASCII letter, space-joined.

    The source is converted with ``pdfdata_to_text`` and split into lines;
    every line without at least one character in ``a``-``z`` is discarded.
    The remaining lines are joined with single spaces.
    """
    kept = []
    for row in pdfdata_to_text(doc_source).splitlines():
        # Keep the line only if it has at least one lowercase letter.
        if re.search('[a-z]', row):
            kept.append(row)
    return ' '.join(kept)
def ut_text_extractor(mimetype, data):
    """Extract text from ``data`` when ``mimetype`` is ``application/pdf``.

    PDF data is run through ``pdfdata_to_text`` and then
    ``text_after_line_numbers``.  For any other MIME type ``None`` is
    returned.
    """
    if mimetype != 'application/pdf':
        return None
    converted = pdfdata_to_text(data)
    return text_after_line_numbers(converted)
def tn_text_extractor(data_source):
    """Return lines containing a lowercase ASCII letter, joined and decoded.

    The source is converted with ``pdfdata_to_text`` and split into lines.
    Lines without any character in ``a``-``z`` are dropped, the rest are
    joined with single spaces, and the joined value is decoded as UTF-8.
    """
    lettered = [
        row
        for row in pdfdata_to_text(data_source).splitlines()
        if re.search('[a-z]', row)
    ]
    joined = ' '.join(lettered)
    # NOTE(review): .decode() presumes the joined value is a byte string
    # (as with Python 2 ``str``) -- confirm against what pdfdata_to_text
    # actually returns.
    return joined.decode('utf8')
def ne_text_extractor(doc_source):
    """Return the result of ``pdfdata_to_text`` for ``doc_source``.

    No further processing is applied to the converted text.
    """
    return pdfdata_to_text(doc_source)
def mo_text_extractor(doc_source):
    """Extract text from ``doc_source`` and encode it as ASCII.

    The source is run through ``pdfdata_to_text`` and
    ``text_after_line_numbers``; the result is then encoded with the
    ``'ascii'`` codec using the ``'ignore'`` error handler.
    """
    body = text_after_line_numbers(pdfdata_to_text(doc_source))
    # 'ignore' silently drops characters that cannot be encoded as ASCII.
    return body.encode('ascii', 'ignore')
def hi_text_extractor(mimetype, doc_source):
    """Extract text from ``doc_source`` when it is ``application/pdf``.

    PDF sources are run through ``pdfdata_to_text`` and then
    ``text_after_line_numbers``.  Any other MIME type yields ``None``.
    """
    if mimetype != 'application/pdf':
        return None
    pdf_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(pdf_text)