コード例 #1
0
def az_text_extractor(mimetype, doc_source):
    """Extract text from a document given its mimetype and raw source.

    When ``mimetype`` is ``'text/html'`` the source is parsed with
    ``lxml.html`` and the text content of the first
    ``<div class="Section2">`` element is returned unmodified.  For any
    other mimetype the source is handed to ``pdfdata_to_text`` and the
    result is passed through ``text_after_line_numbers``.

    Raises ``IndexError`` if an HTML document contains no matching div.
    """
    if mimetype != 'text/html':
        return text_after_line_numbers(pdfdata_to_text(doc_source))

    tree = lxml.html.fromstring(doc_source)
    sections = tree.xpath('//div[@class="Section2"]')
    # Only the first matching section is used.
    return sections[0].text_content()
コード例 #2
0
def fl_text_extractor(doc_source):
    """Extract text from an HTML document.

    If the page contains at least one ``<pre>`` element, the text of the
    first one is encoded to ASCII (unencodable characters replaced) and
    passed through ``text_after_line_numbers``.  Otherwise the text of the
    second ``<td>`` of every table row is joined with newlines.
    """
    tree = lxml.html.fromstring(doc_source)
    pre_nodes = tree.xpath('//pre')

    if not pre_nodes:
        # No <pre> block: fall back to the second cell of each table row.
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join(cell.text_content() for cell in cells)

    # NOTE(review): on Python 3 this encode() produces bytes -- confirm that
    # text_after_line_numbers accepts bytes input.
    ascii_text = pre_nodes[0].text_content().encode('ascii', 'replace')
    return text_after_line_numbers(ascii_text)
コード例 #3
0
def ar_text_extractor(doc_source):
    """Convert ``doc_source`` with ``pdfdata_to_text`` and return the
    result of passing that text through ``text_after_line_numbers``."""
    raw_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(raw_text)
コード例 #4
0
def in_text_extractor(doc_source):
    """Return ``text_after_line_numbers`` applied to the output of
    ``pdfdata_to_text(doc_source)``."""
    return text_after_line_numbers(pdfdata_to_text(doc_source))
コード例 #5
0
def ia_text_extractor(doc_source):
    """Extract text from the first ``<pre>`` element of an HTML document.

    The text is passed through ``text_after_line_numbers`` twice, because
    the content carries two sets of line numbers.

    Raises ``IndexError`` if the document has no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(doc_source)
    result = tree.xpath('//pre')[0].text_content()
    # Two passes: one per set of line numbers.
    for _ in range(2):
        result = text_after_line_numbers(result)
    return result
コード例 #6
0
def ak_text_extractor(doc_source):
    """Extract text from the first ``<pre>`` element of an HTML document
    and pass it through ``text_after_line_numbers``.

    Raises ``IndexError`` if the document has no ``<pre>`` element.
    """
    first_pre = lxml.html.fromstring(doc_source).xpath('//pre')[0]
    return text_after_line_numbers(first_pre.text_content())
コード例 #7
0
def wi_text_extractor(mimetype, url, data):
    """Extract text from ``data`` when the document is identified as a PDF.

    A document counts as a PDF when ``mimetype`` is ``'application/pdf'``
    or ``url`` ends with ``'.pdf'`` (case-sensitive).  PDF data is converted
    with ``pdfdata_to_text`` and passed through ``text_after_line_numbers``.

    Returns ``None`` for anything that is not identified as a PDF.
    """
    # ``url`` is only inspected when the mimetype check does not already match.
    if mimetype != 'application/pdf' and not url.endswith('.pdf'):
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
コード例 #8
0
def ut_text_extractor(mimetype, data):
    """Extract text from ``data`` when ``mimetype`` is ``'application/pdf'``.

    PDF data is converted with ``pdfdata_to_text`` and passed through
    ``text_after_line_numbers``.  Returns ``None`` for any other mimetype.
    """
    if mimetype != 'application/pdf':
        return None
    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
コード例 #9
0
def mo_text_extractor(doc_source):
    """Convert ``doc_source`` with ``pdfdata_to_text``, pass the text
    through ``text_after_line_numbers``, and return that result encoded
    to ASCII with unencodable characters dropped."""
    raw_text = pdfdata_to_text(doc_source)
    stripped = text_after_line_numbers(raw_text)
    # 'ignore' silently discards any character that is not plain ASCII.
    return stripped.encode('ascii', 'ignore')
コード例 #10
0
def hi_text_extractor(mimetype, doc_source):
    """Extract text from ``doc_source`` when ``mimetype`` is
    ``'application/pdf'``.

    PDF data is converted with ``pdfdata_to_text`` and passed through
    ``text_after_line_numbers``.  Returns ``None`` for any other mimetype.
    """
    is_pdf = mimetype == 'application/pdf'
    if not is_pdf:
        return None
    return text_after_line_numbers(pdfdata_to_text(doc_source))