def extract_text(oyster_doc, data):
    """Return the document text from the 'LEGISLATIVE RESOLUTION' line onward.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines.
    Everything before the first line containing 'LEGISLATIVE RESOLUTION' is
    dropped, as is every later line that begins (after optional whitespace)
    with a ``-<digits>-`` marker. The surviving lines are joined with single
    spaces.

    If the heading never appears, the whole document is kept rather than
    being truncated to its final line, and an empty document yields ``''``
    instead of raising on an unbound loop variable.

    ``oyster_doc`` is not used by this function.
    """
    lines = pdfdata_to_text(data).splitlines()
    # Matches lines such as "   -3-"; raw string so \s and \d reach the regex
    # engine without relying on Python passing unknown escapes through.
    line_num_re = re.compile(r'\s*-\d+-')

    # Index of the heading line, or 0 when there is no heading (or no lines).
    start = next(
        (idx for idx, line in enumerate(lines)
         if 'LEGISLATIVE RESOLUTION' in line),
        0,
    )
    return ' '.join(line for line in lines[start:]
                    if not line_num_re.match(line))
def extract_text(oyster_doc, data):
    """Extract text from ``data`` when the document is recorded as a PDF.

    The mimetype is read from ``oyster_doc["metadata"]["mimetype"]``. For
    "application/pdf" the data is run through ``pdfdata_to_text`` and the
    result through ``text_after_line_numbers``; for any other mimetype the
    function returns ``None``.
    """
    mimetype = oyster_doc["metadata"]["mimetype"]
    if mimetype != "application/pdf":
        # Non-PDF documents are not handled here.
        return None

    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
def extract_text(oyster_doc, data):
    """Convert ``data`` with ``pdfdata_to_text`` and post-process the result.

    The converted text is handed to ``text_after_line_numbers`` and whatever
    that helper returns is returned unchanged. ``oyster_doc`` is not used.
    """
    return text_after_line_numbers(pdfdata_to_text(data))
def extract_text(oyster_doc, data):
    """Return ``text_after_line_numbers`` applied to the converted ``data``.

    ``data`` is first converted with ``pdfdata_to_text``. ``oyster_doc`` is
    not used by this function.
    """
    raw_text = pdfdata_to_text(data)
    cleaned = text_after_line_numbers(raw_text)
    return cleaned
def extract_text(oyster_doc, data):
    """Return the processed text of a PDF document, or ``None`` otherwise.

    A document counts as a PDF when ``oyster_doc['metadata']['mimetype']``
    equals 'application/pdf'. In that case ``data`` is converted with
    ``pdfdata_to_text`` and passed through ``text_after_line_numbers``.
    """
    is_pdf = oyster_doc['metadata']['mimetype'] == 'application/pdf'
    if is_pdf:
        converted = pdfdata_to_text(data)
        return text_after_line_numbers(converted)
    # Any other mimetype: nothing to extract.
    return None
def extract_text(oyster_doc, data):
    """Return the converted document with deeply indented lines removed.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines. A
    line is kept only when its first non-whitespace character is preceded by
    at most ten whitespace characters; blank and whitespace-only lines are
    therefore dropped as well. Kept lines are joined with newlines.

    ``oyster_doc`` is not used by this function.
    """
    lines = pdfdata_to_text(data).splitlines()
    # Raw string: \s and \S are regex escapes, not Python string escapes.
    no_big_indent = re.compile(r'^\s{0,10}\S')
    return '\n'.join(line for line in lines if no_big_indent.match(line))
def extract_text(oyster_doc, data):
    """Return the converted document restricted to lines with a lowercase letter.

    ``data`` is converted with ``pdfdata_to_text`` and split into lines. Only
    lines containing at least one ASCII lowercase letter (``a``-``z``) are
    kept, and they are joined with single spaces.

    ``oyster_doc`` is not used by this function.
    """
    # search() stops at the first lowercase letter; the previous findall()
    # collected every match on the line just to test for truthiness.
    has_lowercase = re.compile('[a-z]').search
    lines = pdfdata_to_text(data).splitlines()
    return ' '.join(line for line in lines if has_lowercase(line))
def extract_text(oyster_doc, data):
    """Return the processed document text encoded as ASCII.

    ``data`` is converted with ``pdfdata_to_text``, passed through
    ``text_after_line_numbers``, and the result is encoded with the 'ascii'
    codec using the 'ignore' error handler. ``oyster_doc`` is not used.
    """
    processed = text_after_line_numbers(pdfdata_to_text(data))
    # 'ignore' makes the codec skip characters it cannot encode instead of
    # raising.
    return processed.encode('ascii', 'ignore')
def extract_text(oyster_doc, data):
    """Return the processed text for PDF documents, or ``None`` otherwise.

    A document is treated as a PDF when ``oyster_doc['metadata']['mimetype']``
    equals 'application/pdf' or, failing that, when ``oyster_doc['url']``
    ends with '.pdf'. For PDFs, ``data`` is converted with
    ``pdfdata_to_text`` and passed through ``text_after_line_numbers``.
    """
    if oyster_doc['metadata']['mimetype'] != 'application/pdf':
        # The URL is only consulted when the mimetype did not already match.
        if not oyster_doc['url'].endswith('.pdf'):
            return None

    converted = pdfdata_to_text(data)
    return text_after_line_numbers(converted)