Esempio n. 1
0
def process(doc, url):
    """Extract readable text and referenced images from an HTML document.

    params:
        doc: raw HTML document parsed by readability's Document
        url: base URL of the document, used to resolve relative image srcs

    Returns a tuple (text, images, page_count):
        text: plain text extracted from the readable summary
        images: list of open NamedTemporaryFile objects holding image data
        page_count: always 1 for HTML documents
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            # Anchored check (re.match): the src is remote only if it
            # *starts* with a scheme; re.search would also match data: URIs
            # that merely contain "http://" somewhere in their payload.
            if re.match(r'https?://', img_src):
                # timeout so a stalled image host cannot hang extraction
                r = requests.get(img_src, stream=True, timeout=30)
                write_file(r, fp)
            else:
                # Assume an inline data URI: decode the base64 payload
                # after the "data:...;base64," header.
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            # Best effort: a broken image must not abort text extraction.
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {
                    'url': url
                }},
            )

    html = '<h1>' + title + '</h1>' + summary

    # Strip newlines so the readability output collapses into one block.
    regex = re.compile(r'\n*')
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
Esempio n. 2
0
def download(file_url, SUPPORTED_MIME_TYPES):
    """
    Download a file from the given URL into a temporary file.

    params:
        file_url: direct URL of the file to download
        SUPPORTED_MIME_TYPES: accepted mime types (kept for interface
            compatibility; this variant downloads unconditionally)

    Returns (outfp, mime_type): an open NamedTemporaryFile containing the
    downloaded content, and the Content-Type reported by the server.
    """

    outfp = tempfile.NamedTemporaryFile()

    # TODO: verify url
    # timeout so an unresponsive host cannot hang the request forever
    r = requests.get(file_url, stream=True, headers=DEFAULT_HEADERS,
                     timeout=30)
    mime_type = r.headers["content-type"]
    write_file(r, outfp)
    return outfp, mime_type
Esempio n. 3
0
def download(file_url, SUPPORTED_MIME_TYPES, exception=None):
    """
    Download a file from the given URL if its mime type is supported.

    params:
        file_url: direct URL of the file to download
        SUPPORTED_MIME_TYPES: collection of accepted mime types
        exception: optional exception class raised when the server reports
            an unsupported mime type; when None, the function returns None
            for unsupported types.

    Returns (outfp, mime_type) on success — an open NamedTemporaryFile and
    the server-reported Content-Type; returns None when the mime type is
    unsupported and no exception class was given.
    """

    outfp = tempfile.NamedTemporaryFile()

    # TODO: verify url
    # timeout so an unresponsive host cannot hang the request forever
    r = requests.get(file_url, stream=True, headers=DEFAULT_HEADERS,
                     timeout=30)
    mime_type = r.headers["content-type"]

    if mime_type in SUPPORTED_MIME_TYPES:
        write_file(r, outfp)
        return outfp, mime_type

    # Unsupported type: close (and thereby delete) the unused temp file so
    # the file descriptor does not leak.
    outfp.close()
    if exception:
        raise exception('Unsupported Mime Type: ' + mime_type)
Esempio n. 4
0
    def __init__(self, url):
        """Fetch *url*, classify its content type, and dispatch to the base
        extractor.

        A HEAD request sniffs the content type: HTML goes through the
        web-info extractor, while binary documents (PDF/DOCX/PPTX) are
        streamed into a temp file. If even the HEAD request fails, the
        content is assumed to be HTML and fetched directly.
        """
        doc_type = HTML
        doc = None
        params = {'url': url}

        try:
            # NOTE(review): verify=False disables TLS certificate checking —
            # kept to preserve existing behavior, but this is insecure for
            # untrusted URLs; confirm it is intentional.
            r = requests.head(url, headers=DEFAULT_HEADERS, verify=False,
                              timeout=30)
        except requests.exceptions.RequestException:
            # If we can't get the headers, assume html and try to continue.
            r = requests.get(url, headers=DEFAULT_HEADERS, verify=False,
                             timeout=30)
            doc = r.content
            super().__init__(doc, doc_type, params=params)
            return

        content_type = r.headers.get('content-type')
        if not content_type or \
                any(x in content_type for x in self.HTML_TYPES):
            doc = get_web_info_extractor(url).get_content()
        else:
            # delete=False: the temp file is handed to the base class and
            # must outlive this constructor.
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR,
                                             delete=False)
            r = requests.get(url,
                             stream=True,
                             headers=DEFAULT_HEADERS,
                             verify=False,
                             timeout=30)
            write_file(r, fp)

            doc = fp
            # Classify the binary payload from the GET response's mime type.
            content_type = r.headers["content-type"]
            if any(x in content_type for x in self.PDF_TYPES):
                doc_type = PDF

            elif any(x in content_type for x in self.DOCX_TYPES):
                doc_type = DOCX

            elif any(x in content_type for x in self.PPTX_TYPES):
                doc_type = PPTX

        super().__init__(doc, doc_type, params=params)
Esempio n. 5
0
def process(doc):
    """Extract readable text and referenced images from an HTML document.

    params:
        doc: raw HTML document parsed by readability's Document

    Returns (text, images): the plain text of the readable summary and a
    list of open NamedTemporaryFile objects holding the downloaded images.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            # timeout so a stalled image host cannot hang extraction
            r = requests.get(img.get('src'), stream=True, timeout=30)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            # Best effort: a broken image must not abort text extraction.
            # NOTE(review): failures are silently dropped — consider
            # logging here as the sibling extractor does.
            pass

    html = '<h1>' + title + '</h1>' + summary

    # Strip newlines so the readability output collapses into one block.
    regex = re.compile(r'\n*')
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
Esempio n. 6
0
    def __init__(self, url):
        """Fetch *url*, classify its content type, and dispatch to the base
        extractor.

        A HEAD request sniffs the content type: HTML is re-fetched with a
        plain GET, while binary documents (PDF/DOCX/PPTX) are streamed into
        a temp file. If even the HEAD request fails, the content is assumed
        to be HTML and fetched directly.
        """
        doc_type = HTML
        doc = None

        try:
            # timeout so an unresponsive host cannot hang the constructor
            r = requests.head(url, headers=DEFAULT_HEADERS, timeout=30)
        except requests.exceptions.RequestException:
            # If we can't get the headers, assume html and try to continue.
            r = requests.get(url, headers=DEFAULT_HEADERS, timeout=30)
            doc = r.content
            super().__init__(doc, doc_type)
            return

        content_type = r.headers.get('content-type')
        if not content_type or \
                any(x in content_type for x in self.HTML_TYPES):
            r = requests.get(url, headers=DEFAULT_HEADERS, timeout=30)
            doc = r.content
        else:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(url, stream=True, headers=DEFAULT_HEADERS,
                             timeout=30)
            write_file(r, fp)

            doc = fp
            # Classify the binary payload from the GET response's mime type.
            content_type = r.headers["content-type"]
            if any(x in content_type for x in self.PDF_TYPES):
                doc_type = PDF

            elif any(x in content_type for x in self.DOCX_TYPES):
                doc_type = DOCX

            elif any(x in content_type for x in self.PPTX_TYPES):
                doc_type = PPTX

        super().__init__(doc, doc_type)