def extract_text(doc, data):
    """Extract the text of a document, dispatching on its mimetype.

    For ``text/html`` documents, ``data`` is parsed with
    ``lxml.html.fromstring`` and the text content of the first
    ``<div class="Section2">`` match is returned (indexing the match list
    fails if there is no such element).

    For every other mimetype, ``data`` is converted with
    ``pdfdata_to_text`` and the result is passed through
    ``text_after_line_numbers``.
    """
    if doc['mimetype'] != 'text/html':
        return text_after_line_numbers(pdfdata_to_text(data))

    tree = lxml.html.fromstring(data)
    matches = tree.xpath('//div[@class="Section2"]')
    return matches[0].text_content()
def extract_text(doc, data):
    """Return the resolution body of a PDF as one space-joined string.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines.
    Lines before the first one containing "LEGISLATIVE RESOLUTION" are
    dropped, as are lines that begin (after optional whitespace) with a
    ``-#-`` number marker. The remaining lines are joined with spaces.

    If no line contains the heading, the whole text is kept; if the
    converted text has no lines at all, an empty string is returned.

    ``doc`` is not used.
    """
    lines = pdfdata_to_text(data).splitlines()
    line_num_re = re.compile(r"\s*-\d+-")  # number: -#-

    # Default to 0 so that a missing heading keeps the full text and an
    # empty document does not leave the start index undefined.
    start = next(
        (i for i, line in enumerate(lines) if "LEGISLATIVE RESOLUTION" in line),
        0,
    )
    return " ".join(
        line for line in lines[start:] if not line_num_re.match(line)
    )
def extract_text(doc, data):
    """Return the resolution body of a PDF as one space-joined string.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines.
    Lines before the first one containing 'LEGISLATIVE RESOLUTION' are
    dropped, as are lines that begin (after optional whitespace) with a
    ``-#-`` number marker. The remaining lines are joined with spaces.

    If no line contains the heading, the whole text is kept; if the
    converted text has no lines at all, an empty string is returned.

    ``doc`` is not used.
    """
    lines = pdfdata_to_text(data).splitlines()
    line_num_re = re.compile(r'\s*-\d+-')  # number: -#-

    # Fall back to index 0: without a default, an absent heading would keep
    # only the final line and an empty document would raise.
    start = 0
    for i, line in enumerate(lines):
        if 'LEGISLATIVE RESOLUTION' in line:
            start = i
            break

    kept = (line for line in lines[start:] if not line_num_re.match(line))
    return ' '.join(kept)
def extract_text(oyster_doc, data):
    """Extract text from a document whose metadata marks it as a PDF.

    Reads ``oyster_doc['metadata']['mimetype']``. When it equals
    ``application/pdf``, ``data`` is converted with ``pdfdata_to_text`` and
    the result is passed through ``text_after_line_numbers``. For any other
    mimetype the function returns ``None``.
    """
    mimetype = oyster_doc['metadata']['mimetype']
    if mimetype != 'application/pdf':
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(doc, data):
    """Return the PDF lines that contain a lowercase ASCII letter.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines;
    lines with at least one character in ``[a-z]`` are joined with single
    spaces. A byte-string result is decoded as UTF-8; a result that is
    already text is returned unchanged.

    ``doc`` is not used.
    """
    lines = pdfdata_to_text(data).splitlines()
    text = ' '.join(line for line in lines if re.search('[a-z]', line))

    # Decode only byte strings: calling ``.decode`` unconditionally raises
    # AttributeError on a Python 3 ``str``, which is already decoded text.
    # NOTE(review): assumes pdfdata_to_text yields text on Python 3 - confirm.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    return text
def extract_text(oyster_doc, data):
    """Convert PDF ``data`` to text and post-process it.

    The output of ``pdfdata_to_text(data)`` is passed through
    ``text_after_line_numbers`` and that result is returned.
    ``oyster_doc`` is not used.
    """
    raw_text = pdfdata_to_text(data)
    return text_after_line_numbers(raw_text)
def extract_text(doc, data):
    """Return ``text_after_line_numbers`` applied to the converted PDF text.

    ``data`` is first converted with ``pdfdata_to_text``. ``doc`` is not
    used.
    """
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(oyster_doc, data):
    """Return the PDF lines that contain a lowercase ASCII letter.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines;
    every line with at least one ``[a-z]`` character is kept, and the kept
    lines are joined with single spaces. ``oyster_doc`` is not used.
    """
    kept = []
    for row in pdfdata_to_text(data).splitlines():
        if re.findall('[a-z]', row):
            kept.append(row)
    return ' '.join(kept)
def extract_text(doc, data):
    """Return the post-processed PDF text encoded as ASCII.

    ``data`` is converted with ``pdfdata_to_text``, passed through
    ``text_after_line_numbers``, and the result's
    ``.encode('ascii', 'ignore')`` is returned. ``doc`` is not used.
    """
    cleaned = text_after_line_numbers(pdfdata_to_text(data))
    return cleaned.encode('ascii', 'ignore')
def extract_text(doc, data):
    """Extract text when the document is a PDF by mimetype or URL suffix.

    A document counts as a PDF when ``doc["mimetype"]`` equals
    ``application/pdf`` or ``doc["url"]`` ends with ``.pdf`` (the URL is only
    consulted when the mimetype does not match). For PDFs, ``data`` is
    converted with ``pdfdata_to_text`` and passed through
    ``text_after_line_numbers``. Otherwise ``None`` is returned.
    """
    if doc["mimetype"] != "application/pdf" and not doc["url"].endswith(".pdf"):
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(doc, data):
    """Extract text from a document whose mimetype is ``application/pdf``.

    For PDFs, ``data`` is converted with ``pdfdata_to_text`` and passed
    through ``text_after_line_numbers``. Any other ``doc["mimetype"]`` yields
    ``None``.
    """
    is_pdf = doc["mimetype"] == "application/pdf"
    return text_after_line_numbers(pdfdata_to_text(data)) if is_pdf else None
def extract_text(doc, data):
    """Extract text when the document is a PDF by mimetype or URL suffix.

    ``doc['mimetype']`` is checked first; ``doc['url']`` is only examined
    (for a ``.pdf`` suffix) when the mimetype is not ``application/pdf``.
    For PDFs, ``data`` is converted with ``pdfdata_to_text`` and passed
    through ``text_after_line_numbers``. Otherwise ``None`` is returned.
    """
    if doc['mimetype'] == 'application/pdf':
        looks_like_pdf = True
    else:
        looks_like_pdf = doc['url'].endswith('.pdf')

    if not looks_like_pdf:
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(doc, data):
    """Return the PDF lines that are not deeply indented.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines. A
    line is kept when at most ten whitespace characters precede its first
    non-whitespace character; blank lines and lines indented further are
    dropped. Kept lines are joined with newlines.

    ``doc`` is not used.
    """
    # Raw string: in a plain literal ``\s`` and ``\S`` are invalid string
    # escapes, which newer Python versions warn about at compile time.
    no_big_indent = re.compile(r'^\s{0,10}\S')
    lines = pdfdata_to_text(data).splitlines()
    return '\n'.join(line for line in lines if no_big_indent.match(line))
def extract_text(doc, data):
    """Convert PDF ``data`` to text and post-process it.

    Returns ``text_after_line_numbers`` applied to the output of
    ``pdfdata_to_text(data)``. ``doc`` is not used.
    """
    converted = pdfdata_to_text(data)
    return text_after_line_numbers(converted)
def extract_text(doc, data):
    """Extract text from a document whose mimetype is ``application/pdf``.

    Non-PDF documents produce ``None``. For PDFs, ``data`` is converted with
    ``pdfdata_to_text`` and the result is passed through
    ``text_after_line_numbers``.
    """
    if doc['mimetype'] != 'application/pdf':
        return None
    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
def extract_text(doc, data):
    """Return the PDF lines that contain a lowercase ASCII letter.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines;
    lines with at least one ``[a-z]`` character are joined with single
    spaces. ``doc`` is not used.
    """
    raw_lines = pdfdata_to_text(data).splitlines()
    with_lowercase = filter(lambda row: re.findall("[a-z]", row), raw_lines)
    return " ".join(with_lowercase)
def extract_text(oyster_doc, data):
    """Return the PDF lines that are not deeply indented.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines. A
    line is kept when at most ten whitespace characters precede its first
    non-whitespace character; blank lines and lines indented further are
    dropped. Kept lines are joined with newlines.

    ``oyster_doc`` is not used.
    """
    # Raw string: in a plain literal ``\s`` and ``\S`` are invalid string
    # escapes, which newer Python versions warn about at compile time.
    no_big_indent = re.compile(r"^\s{0,10}\S")
    kept = [
        line
        for line in pdfdata_to_text(data).splitlines()
        if no_big_indent.match(line)
    ]
    return "\n".join(kept)