Beispiel #1
0
def extract_text(doc, data):
    """Extract text from ``data`` according to ``doc['mimetype']``.

    For ``text/html`` documents, ``data`` is parsed with ``lxml.html`` and
    the text content of the first ``div`` whose class attribute is exactly
    ``Section2`` is returned; indexing ``[0]`` fails if no such div exists.

    For every other mimetype, ``data`` is handed to ``pdfdata_to_text`` and
    that result is passed through ``text_after_line_numbers``.
    """
    # Anything that is not HTML takes the PDF conversion path.
    if doc['mimetype'] != 'text/html':
        return text_after_line_numbers(pdfdata_to_text(data))

    tree = lxml.html.fromstring(data)
    sections = tree.xpath('//div[@class="Section2"]')
    return sections[0].text_content()
Beispiel #2
0
def extract_text(doc, data):
    """Return the text of an HTML or non-HTML document.

    ``doc['mimetype']`` selects the path:

    * ``text/html``: parse ``data`` with ``lxml.html`` and return the text
      content of the first ``div`` with ``class="Section2"`` (no guard is
      applied if the div is missing).
    * anything else: return
      ``text_after_line_numbers(pdfdata_to_text(data))``.
    """
    is_html = doc['mimetype'] == 'text/html'
    if not is_html:
        converted = pdfdata_to_text(data)
        return text_after_line_numbers(converted)

    root = lxml.html.fromstring(data)
    matches = root.xpath('//div[@class="Section2"]')
    first_section = matches[0]
    return first_section.text_content()
Beispiel #3
0
def extract_text(oyster_doc, data):
    """Extract text from an HTML payload.

    ``data`` is parsed with ``lxml.html``. When the page has at least one
    ``<pre>`` element, the first one's text content is encoded to ASCII
    (``'replace'`` error handler) and passed to ``text_after_line_numbers``.
    Otherwise the text contents of every ``//tr/td[2]`` match are joined
    with newlines.

    ``oyster_doc`` is accepted but not used here.
    """
    tree = lxml.html.fromstring(data)
    pre_nodes = tree.xpath('//pre')

    # No <pre> block: fall back to the second cell of each table row.
    if not pre_nodes:
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join(cell.text_content() for cell in cells)

    encoded = pre_nodes[0].text_content().encode('ascii', 'replace')
    return text_after_line_numbers(encoded)
Beispiel #4
0
def extract_text(doc, data):
    """Extract text from HTML ``data``; the ``doc`` argument is not read.

    Prefers the first ``<pre>`` element: its text content is ASCII-encoded
    with the ``"replace"`` handler and run through
    ``text_after_line_numbers``. Without any ``<pre>`` element, the text of
    each ``//tr/td[2]`` match is joined with newline characters.
    """
    root = lxml.html.fromstring(data)
    preformatted = root.xpath("//pre")
    if preformatted:
        first_pre = preformatted[0]
        ascii_text = first_pre.text_content().encode("ascii", "replace")
        return text_after_line_numbers(ascii_text)

    second_cells = [td.text_content() for td in root.xpath("//tr/td[2]")]
    return "\n".join(second_cells)
Beispiel #5
0
def extract_text(doc, data):
    """Pull text out of an HTML document given as ``data``.

    The incoming ``doc`` value is ignored. If the parsed page contains a
    ``<pre>`` element, the first one's text content is encoded with
    ``.encode('ascii', 'replace')`` and handed to
    ``text_after_line_numbers``. If not, the text content of every
    ``//tr/td[2]`` node is collected and joined with ``'\\n'``.
    """
    tree = lxml.html.fromstring(data)
    pre_elements = tree.xpath('//pre')

    if pre_elements:
        body = pre_elements[0].text_content()
        return text_after_line_numbers(body.encode('ascii', 'replace'))

    # Table fallback: gather the second cell of each row.
    pieces = []
    for cell in tree.xpath('//tr/td[2]'):
        pieces.append(cell.text_content())
    return '\n'.join(pieces)
Beispiel #6
0
def extract_text(doc, data):
    if doc['mimetype'] == 'application/pdf':
        return text_after_line_numbers(pdfdata_to_text(data))
Beispiel #7
0
def extract_text(doc, data):
    """Convert ``data`` via ``pdfdata_to_text`` and post-process the result.

    The converted text is passed to ``text_after_line_numbers`` and that
    return value is handed back. ``doc`` is not consulted.
    """
    raw_text = pdfdata_to_text(data)
    return text_after_line_numbers(raw_text)
Beispiel #8
0
def extract_text(doc, data):
    """Return ``text_after_line_numbers`` applied to the converted ``data``.

    ``data`` is first run through ``pdfdata_to_text``; the ``doc`` argument
    is unused.
    """
    return text_after_line_numbers(pdfdata_to_text(data))
Beispiel #9
0
def extract_text(oyster_doc, data):
    """Extract text when the document metadata declares a PDF.

    Reads ``oyster_doc['metadata']['mimetype']``. For ``application/pdf``
    the result of ``text_after_line_numbers(pdfdata_to_text(data))`` is
    returned; for any other value the function returns ``None``.
    """
    mimetype = oyster_doc['metadata']['mimetype']
    if mimetype != 'application/pdf':
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
Beispiel #10
0
def extract_text(doc, data):
    """Extract text from ``data`` if ``doc`` looks like a PDF.

    A document counts as a PDF when ``doc['mimetype']`` is
    ``application/pdf`` or ``doc['url']`` ends with ``.pdf``. In that case
    ``text_after_line_numbers(pdfdata_to_text(data))`` is returned;
    otherwise the function returns ``None``.
    """
    # The URL is only inspected when the mimetype does not already match.
    if doc['mimetype'] != 'application/pdf' and not doc['url'].endswith('.pdf'):
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
Beispiel #11
0
def extract_text(doc, data):
    """Return extracted text for PDF documents, ``None`` for anything else.

    The PDF check passes when ``doc['mimetype']`` equals
    ``application/pdf``; failing that, when ``doc['url']`` ends in
    ``.pdf``. PDF data goes through ``pdfdata_to_text`` and then
    ``text_after_line_numbers``.
    """
    looks_like_pdf = doc['mimetype'] == 'application/pdf'
    if not looks_like_pdf:
        # Fall back to the URL suffix only when the mimetype did not match.
        looks_like_pdf = doc['url'].endswith('.pdf')

    if not looks_like_pdf:
        return None

    converted = pdfdata_to_text(data)
    return text_after_line_numbers(converted)
Beispiel #12
0
def extract_text(doc, data):
    """Convert ``data`` to text and return it ASCII-encoded.

    ``data`` is passed through ``pdfdata_to_text`` and then
    ``text_after_line_numbers``; the result is encoded with
    ``.encode('ascii', 'ignore')``. ``doc`` is unused.
    """
    numbered_text = text_after_line_numbers(pdfdata_to_text(data))
    return numbered_text.encode('ascii', 'ignore')
Beispiel #13
0
def extract_text(oyster_doc, data):
    """Return the processed text of the first ``<pre>`` element in ``data``.

    ``data`` is parsed with ``lxml.html``; the text content of the first
    ``//pre`` match is passed to ``text_after_line_numbers``. There is no
    guard for pages without a ``<pre>`` element, and ``oyster_doc`` is not
    read.
    """
    tree = lxml.html.fromstring(data)
    first_pre = tree.xpath('//pre')[0]
    return text_after_line_numbers(first_pre.text_content())
Beispiel #14
0
def extract_text(doc, data):
    """Extract text from PDF ``data``; non-PDF documents yield ``None``.

    ``doc`` is treated as a PDF if ``doc["mimetype"]`` is
    ``application/pdf`` or ``doc["url"]`` ends with ``.pdf``. PDF data is
    converted by ``pdfdata_to_text`` and the result is passed to
    ``text_after_line_numbers``.
    """
    if doc["mimetype"] == "application/pdf" or doc["url"].endswith(".pdf"):
        extracted = pdfdata_to_text(data)
        return text_after_line_numbers(extracted)
    return None
Beispiel #15
0
def extract_text(doc, data):
    """Run ``data`` through ``pdfdata_to_text`` then ``text_after_line_numbers``.

    Returns whatever ``text_after_line_numbers`` produces. ``doc`` is not
    used by this implementation.
    """
    converted = pdfdata_to_text(data)
    result = text_after_line_numbers(converted)
    return result
Beispiel #16
0
def extract_text(oyster_doc, data):
    """Return ``text_after_line_numbers`` applied to the converted ``data``.

    Conversion is done by ``pdfdata_to_text``. ``oyster_doc`` is accepted
    but ignored.
    """
    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
Beispiel #17
0
def extract_text(doc, data):
    """Return ASCII-encoded text derived from ``data``.

    Steps: ``pdfdata_to_text(data)``, then ``text_after_line_numbers``,
    then ``.encode('ascii', 'ignore')`` on the result. The ``doc``
    argument is not read.
    """
    converted = pdfdata_to_text(data)
    body = text_after_line_numbers(converted)
    ascii_body = body.encode('ascii', 'ignore')
    return ascii_body
Beispiel #18
0
def extract_text(doc, data):
    """Return text from the first ``<pre>`` element after two cleanup passes.

    ``data`` is parsed with ``lxml.html`` and the text content of the first
    ``//pre`` match is taken (no guard for a missing ``<pre>``). The text
    is then fed through ``text_after_line_numbers`` twice, each pass
    operating on the previous pass's output. The incoming ``doc`` value is
    not used.
    """
    tree = lxml.html.fromstring(data)
    text = tree.xpath('//pre')[0].text_content()

    # Two passes, per the original note: strip two sets of line numbers.
    for _ in range(2):
        text = text_after_line_numbers(text)
    return text
Beispiel #19
0
def extract_text(doc, data):
    """Extract text for documents whose mimetype is ``application/pdf``.

    For PDFs, returns ``text_after_line_numbers(pdfdata_to_text(data))``.
    For every other ``doc["mimetype"]`` value, returns ``None``.
    """
    is_pdf = doc["mimetype"] == "application/pdf"
    if not is_pdf:
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
Beispiel #20
0
def extract_text(doc, data):
    """Extract the first ``<pre>`` block of HTML ``data`` and clean it twice.

    The ``doc`` argument is ignored. The text content of the first
    ``//pre`` element is passed to ``text_after_line_numbers``, and that
    output is passed to ``text_after_line_numbers`` a second time. An
    absent ``<pre>`` element is not handled.
    """
    root = lxml.html.fromstring(data)
    pre_text = root.xpath('//pre')[0].text_content()

    # Original note: strip two sets of line numbers.
    first_pass = text_after_line_numbers(pre_text)
    second_pass = text_after_line_numbers(first_pass)
    return second_pass
Beispiel #21
0
def extract_text(oyster_doc, data):
    """Return ``text_after_line_numbers`` of the first ``<pre>`` element's text.

    ``data`` is parsed as HTML with ``lxml.html``. The first ``//pre``
    match is indexed directly, so a page without one is not handled.
    ``oyster_doc`` is unused.
    """
    pre_matches = lxml.html.fromstring(data).xpath('//pre')
    return text_after_line_numbers(pre_matches[0].text_content())