Example 1
def finisher(the_record):
    """
    POST the finished capture record to the provided callback URL
    """
    db.session.add(the_record)
    verify_ssl = app.config['SSL_HOST_VALIDATION']
    # Set the correct headers for the postback
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Connection': 'close'}
    #proxy = {"http": "127.0.0.1:8080"}

    try:
        # Blacklist IP addresses
        ip_addr = socket.gethostbyname(grab_domain(the_record.url))

        if app.config['IP_BLACKLISTING']:
            if netaddr.all_matching_cidrs(ip_addr, app.config['IP_BLACKLISTING_RANGE'].split(',')):
                the_record.capture_status = "IP BLACKLISTED:{} - ".format(ip_addr) + the_record.capture_status
    except Exception:
        # DNS resolution can fail for unreachable hosts; the blacklist check
        # is best-effort, so continue to the postback regardless
        pass

    req = post(the_record.callback, verify=verify_ssl, data=json.dumps(the_record.as_dict()), headers=headers)

    # If a 4xx or 5xx status is received, raise an exception
    req.raise_for_status()

    # Update capture_record and save to database
    the_record.job_status = 'COMPLETED'
    # Removed to propagate blacklist message
    #the_record.capture_status = 'CALLBACK_SUCCEEDED'
    db.session.add(the_record)
    db.session.commit()
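
For context, finisher() assumes something is listening at the record's callback URL. A minimal sketch of such a receiver, assuming Flask on the receiving side (the /callback route and its handling are hypothetical, not part of this project):

# Hypothetical callback receiver for the JSON postback sent by finisher();
# the '/callback' route name and response handling are illustrative only.
from flask import Flask, request

callback_app = Flask(__name__)

@callback_app.route('/callback', methods=['POST'])
def receive_capture():
    record = request.get_json(force=True)  # the serialized capture record
    # ... persist or act on the finished capture here ...
    return 'OK', 200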
Example 2
def finisher(the_record):
    """
    POST the finished capture record to the provided callback URL
    """
    db.session.add(the_record)
    verify_ssl = app.config['SSL_HOST_VALIDATION']
    # Set the correct headers for the postback
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Connection': 'close'}
    #proxy = {"http": "127.0.0.1:8080"}

    try:
        # Blacklist IP addresses
        ip_addr = socket.gethostbyname(grab_domain(the_record.url))

        if app.config['IP_BLACKLISTING']:
            if netaddr.all_matching_cidrs(ip_addr, app.config['IP_BLACKLISTING_RANGE'].split(',')):
                the_record.capture_status = "IP BLACKLISTED:{} - ".format(ip_addr) + the_record.capture_status
    except Exception:
        # DNS resolution can fail for unreachable hosts; the blacklist check
        # is best-effort, so continue to the postback regardless
        pass

    req = post(the_record.callback, verify=verify_ssl, data=json.dumps(the_record.as_dict()), headers=headers)

    # If a 4xx or 5xx status is received, raise an exception
    req.raise_for_status()

    # Update capture_record and save to database
    the_record.job_status = 'COMPLETED'
    # Removed to propagate blacklist message
    #the_record.capture_status = 'CALLBACK_SUCCEEDED'
    db.session.add(the_record)
    db.session.commit()
Example 3
def do_capture(status_code, the_record, base_url, model='capture'):
    """
    Create a screenshot and text scrape from a provided url or html file.

    This depends on phantomjs and an associated javascript file to perform the captures.
    In the event an error occurs, an exception is raised and handled by the celery task
    or the controller that called this method.
    """
    # Make sure the_record is attached to the current database session
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS config file
    if model == 'static':
        capture_name = the_record.filename
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/assets/static.js',
            app.config['LOCAL_STORAGE_FOLDER'],
            capture_name]
        content_to_parse = os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name)
    else:
        capture_name = grab_domain(the_record.url) + '_' + str(the_record.id)
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/assets/capture.js',
            the_record.url,
            os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name)]

        content_to_parse = os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name + '.html')
    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=35)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error('PhantomJS Static Capture timeout')
        raise Exception('PhantomJS Static Capture timeout')

    # Any output on stdout or stderr is treated as a capture error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ('script', 'noscript', 'style')
    with open(content_to_parse, 'r') as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ''
        tail = elt.tail or ''
        wordz = " ".join((text, tail)).strip('\t')
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode('utf-8')

    # Since the filename format is different for static captures, update the filename
    # This will ensure the URLs are pointing to the correct resources
    if model == 'static':
        capture_name = capture_name.split('.')[0]
        
    # Write the parsed html text into our capture folder
    with open(os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name + '.txt'), 'wb') as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape, and html captures
    the_record.sketch_url = base_url + '/files/' + capture_name + '.png'
    the_record.scrape_url = base_url + '/files/' + capture_name + '.txt'
    the_record.html_url = base_url + '/files/' + capture_name + '.html'

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write['sketch'] = capture_name + '.png'
    files_to_write['scrape'] = capture_name + '.txt'
    files_to_write['html'] = capture_name + '.html'

    # Mark the record completed if we are not also writing to S3
    the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    if not app.config['USE_S3']:
        the_record.job_status = "COMPLETED"
    db.session.commit()
    return files_to_write
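
The grab_domain() helper used above for naming captures is not shown in these examples. A plausible stand-in built on the standard library might look like the following; this is an assumption about its behavior, not the project's actual implementation:

# Hypothetical stand-in for the grab_domain() helper; the real one may
# normalize hostnames differently.
from urlparse import urlparse  # Python 2 stdlib; use urllib.parse on Python 3

def grab_domain(url):
    """Return the hostname portion of a URL, e.g. 'example.com'."""
    return urlparse(url).netloc.split(':')[0]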
Example 4
def do_capture(status_code, the_record, base_url, model="capture", phantomjs_timeout=app.config["PHANTOMJS_TIMEOUT"]):
    """
    Create a screenshot and text scrape from a provided url or html file.

    This depends on phantomjs and an associated javascript file to perform the captures.
    In the event an error occurs, an exception is raised and handled by the celery task
    or the controller that called this method.
    """
    # Make sure the_record is attached to the current database session
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS config file
    if model == "static":
        capture_name = the_record.filename
        service_args = [
            app.config["PHANTOMJS"],
            "--ssl-protocol=any",
            "--ignore-ssl-errors=yes",
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/assets/static.js",
            app.config["LOCAL_STORAGE_FOLDER"],
            capture_name,
        ]
        content_to_parse = os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name)
    else:
        capture_name = grab_domain(the_record.url) + "_" + str(the_record.id)
        service_args = [
            app.config["PHANTOMJS"],
            "--ssl-protocol=any",
            "--ignore-ssl-errors=yes",
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/assets/capture.js",
            the_record.url,
            os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name),
        ]

        content_to_parse = os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name + ".html")
    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=phantomjs_timeout)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error("PhantomJS Capture timeout at {} seconds".format(phantomjs_timeout))
        raise subprocess32.TimeoutExpired("phantomjs capture", phantomjs_timeout)

    # Any output on stdout or stderr is treated as a capture error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ("script", "noscript", "style")
    with open(content_to_parse, "r") as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ""
        tail = elt.tail or ""
        wordz = " ".join((text, tail)).strip("\t")
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode("utf-8")

    # Since the filename format is different for static captures, update the filename
    # This will ensure the URLs are pointing to the correct resources
    if model == "static":
        capture_name = capture_name.split(".")[0]

    # Write the parsed html text into our capture folder
    with open(os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name + ".txt"), "wb") as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape, and html captures
    the_record.sketch_url = base_url + "/files/" + capture_name + ".png"
    the_record.scrape_url = base_url + "/files/" + capture_name + ".txt"
    the_record.html_url = base_url + "/files/" + capture_name + ".html"

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write["sketch"] = capture_name + ".png"
    files_to_write["scrape"] = capture_name + ".txt"
    files_to_write["html"] = capture_name + ".html"

    # Mark the record completed if we are not also writing to S3
    the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    if not app.config["USE_S3"]:
        the_record.job_status = "COMPLETED"
    db.session.commit()
    return files_to_write
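
do_capture() leans on several Flask config keys. A sample of the keys it reads, with placeholder values rather than the project's real defaults:

# Placeholder values for the config keys do_capture() reads.
app.config.update(
    PHANTOMJS='/usr/local/bin/phantomjs',  # path to the PhantomJS binary
    LOCAL_STORAGE_FOLDER='/tmp/captures',  # where .png/.txt/.html captures land
    PHANTOMJS_TIMEOUT=35,                  # seconds before the capture is killed
    USE_S3=False,                          # skip the S3 upload path entirely
)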
Example 5
def celery_capture(
    self,
    status_code,
    base_url,
    capture_id=0,
    retries=0,
    model="capture",
    phantomjs_timeout=app.config["PHANTOMJS_TIMEOUT"],
):
    """
    Celery task used to create sketch, scrape, html.
    The task also writes files to S3 or posts a callback, depending on the configuration.
    """
    capture_record = Capture.query.filter(Capture.id == capture_id).first()
    # Write the number of retries to the capture record
    db.session.add(capture_record)
    capture_record.retry = retries
    db.session.commit()

    try:
        # Check if we need to ignore certain hosts
        ip_addr = socket.gethostbyname(grab_domain(capture_record.url))

        if app.config["IP_BLACKLISTING"]:
            if netaddr.all_matching_cidrs(ip_addr, app.config["IP_BLACKLISTING_RANGE"].split(",")):
                capture_record.capture_status = "IP BLACKLISTED:{}".format(ip_addr)
                if capture_record.callback:
                    finisher(capture_record)
                else:
                    capture_record.job_status = "COMPLETED"
                return True
        # Perform a callback or complete the task depending on error code and config
        if capture_record.url_response_code > 400 and not app.config["CAPTURE_ERRORS"]:
            if capture_record.callback:
                finisher(capture_record)
            else:
                capture_record.job_status = "COMPLETED"
            return True
    # Only execute retries on ConnectionError exceptions, otherwise fail immediately
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={"capture_id": capture_id, "retries": capture_record.retry + 1, "model": "capture"},
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    except Exception as err:
        app.logger.error(err)
        capture_record.job_status = "FAILURE"
        if str(err):
            capture_record.capture_status = str(err)
    finally:
        db.session.commit()
    # First perform the captures, then either write to S3, perform a callback, or neither
    try:
        # call the main capture function to retrieve sketches, scrapes, and html
        files_to_write = do_capture(
            status_code, capture_record, base_url, model="capture", phantomjs_timeout=phantomjs_timeout
        )
        # Call the S3 save function if S3 is configured, and perform a callback if configured.
        if app.config["USE_S3"]:
            if capture_record.callback:
                s3_save(files_to_write, capture_record)
                finisher(capture_record)
            else:
                s3_save(files_to_write, capture_record)
        elif capture_record.callback:
            finisher(capture_record)
    # If the screenshot generation timed out, try to render again
    except subprocess32.TimeoutExpired as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={
                "capture_id": capture_id,
                "retries": capture_record.retry,
                "model": "capture",
                "phantomjs_timeout": (capture_record.retry * 5) + phantomjs_timeout,
            },
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    # Retry on connection error exceptions
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={"capture_id": capture_id, "retries": capture_record.retry, "model": "capture"},
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    # For all other exceptions, fail immediately
    except Exception as err:
        app.logger.error(err)
        if str(err):
            capture_record.capture_status = str(err)
        capture_record.job_status = "FAILURE"
        raise
    finally:
        db.session.commit()
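
The blacklist check above duplicates the one in finisher(); a shared helper could factor it out. A sketch against the same config keys (this helper does not exist in the original code):

# Hypothetical shared helper; not part of the original module.
import socket
import netaddr

def is_blacklisted(url):
    """Resolve the URL's host and test it against the configured CIDR ranges."""
    if not app.config["IP_BLACKLISTING"]:
        return False
    ip_addr = socket.gethostbyname(grab_domain(url))
    cidrs = app.config["IP_BLACKLISTING_RANGE"].split(",")
    return bool(netaddr.all_matching_cidrs(ip_addr, cidrs))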
Example 6
def do_capture(status_code, the_record, base_url, model='capture'):
    """
    Create a screenshot and text scrape from a provided url or html file.

    This depends on phantomjs and an associated javascript file to perform the captures.
    In the event an error occurs, an exception is raised and handled by the celery task
    or the controller that called this method.
    """
    # Make sure the_record is attached to the current database session
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS config file
    if model == 'static':
        capture_name = the_record.filename
        service_args = [
            app.config['PHANTOMJS'], '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
            '/assets/static.js', app.config['LOCAL_STORAGE_FOLDER'],
            capture_name
        ]
        content_to_parse = os.path.join(app.config['LOCAL_STORAGE_FOLDER'],
                                        capture_name)
    else:
        capture_name = grab_domain(the_record.url) + '_' + str(the_record.id)
        service_args = [
            app.config['PHANTOMJS'], '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
            '/assets/capture.js', the_record.url,
            os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name)
        ]

        content_to_parse = os.path.join(app.config['LOCAL_STORAGE_FOLDER'],
                                        capture_name + '.html')
    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=35)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error('PhantomJS Static Capture timeout')
        raise Exception('PhantomJS Static Capture timeout')

    # Any output on stdout or stderr is treated as a capture error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ('script', 'noscript', 'style')
    with open(content_to_parse, 'r') as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ''
        tail = elt.tail or ''
        wordz = " ".join((text, tail)).strip('\t')
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode('utf-8')

    # Since the filename format is different for static captures, update the filename
    # This will ensure the URLs are pointing to the correct resources
    if model == 'static':
        capture_name = capture_name.split('.')[0]

    # Write the parsed html text into our capture folder
    with open(
            os.path.join(app.config['LOCAL_STORAGE_FOLDER'],
                         capture_name + '.txt'), 'wb') as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape, and html captures
    the_record.sketch_url = base_url + '/files/' + capture_name + '.png'
    the_record.scrape_url = base_url + '/files/' + capture_name + '.txt'
    the_record.html_url = base_url + '/files/' + capture_name + '.html'

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write['sketch'] = capture_name + '.png'
    files_to_write['scrape'] = capture_name + '.txt'
    files_to_write['html'] = capture_name + '.html'

    # Mark the record completed if we are not also writing to S3
    the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    if not app.config['USE_S3']:
        the_record.job_status = "COMPLETED"
    db.session.commit()
    return files_to_write
Example 7
class Eager(Resource):
    """
    Provides a way to retrieve a sketch, scrape, or html file eagerly (blocking call)

    Methods:
    GET

    Args:
    url = url to generate a text scrape.
    type = ['sketch', 'scrape', 'html'] string to specify capture type
    """
    def get(self):
        """
        Eagerly capture the given URL and return the requested file
        """
        args = EAGERPARSER.parse_args()
        base_url = app.config['BASE_URL']

        # Force local storage for eager requests so the file can be returned directly
        app.config.update(USE_S3='')
        # Parse out url and capture type
        capture_record = Capture()
        capture_record.url = args["url"]
        capture_type = args["type"]

        if capture_type not in ['html', 'sketch', 'scrape']:
            return 'Incorrect capture type specified: html, sketch, or scrape', 406

        # Write to DB
        try:
            db.session.add(capture_record)
            db.session.commit()
        except IntegrityError as exc:
            return {"error": exc.message}, 500

        # Refresh capture_record to obtain an ID for record
        db.session.refresh(capture_record)

        try:
            # Map each capture type to its generated capture file name
            capture_names = {
                'html': grab_domain(capture_record.url) + '_' + str(capture_record.id) + '.html',
                'sketch': grab_domain(capture_record.url) + '_' + str(capture_record.id) + '.png',
                'scrape': grab_domain(capture_record.url) + '_' + str(capture_record.id) + '.txt'
            }
        except Exception:
            return 'This is not a valid URL', 406

        # Check that url is valid and responsive
        if not check_url(capture_record):
            return 'Could not connect to URL', 406

        # file_to_write is a placeholder in eager calls
        file_to_write = {}

        try:
            # Call do_capture to create the scrape, sketch, and html files (blocking)
            files_to_write = tasks.do_capture(200, capture_record, base_url)

            if capture_type in capture_names:
                return send_from_directory(app.config['LOCAL_STORAGE_FOLDER'],
                                           capture_names[capture_type],
                                           as_attachment=True)
            else:
                return 'Incorrect capture type specified: html, sketch, or scrape', 406
        except Exception as err:
            # Consider updating capture_record status here
            app.logger.error(err)
            return str(err), 406
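
Calling the Eager resource is a single blocking GET. A hypothetical client invocation with requests, assuming the resource is registered at /eager (the mount point is not shown in this snippet):

# Hypothetical client call; the '/eager' route is an assumption.
import requests

resp = requests.get('http://localhost:5000/eager',
                    params={'url': 'http://example.com', 'type': 'scrape'})
resp.raise_for_status()
with open('example_scrape.txt', 'wb') as fh:
    fh.write(resp.content)  # the text scrape returned as an attachment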
Example 8
def celery_capture(self,
                   status_code,
                   base_url,
                   capture_id=0,
                   retries=0,
                   model="capture",
                   phantomjs_timeout=app.config['PHANTOMJS_TIMEOUT']):
    """
    Celery task used to create sketch, scrape, html.
    The task also writes files to S3 or posts a callback, depending on the configuration.
    """
    capture_record = Capture.query.filter(Capture.id == capture_id).first()
    # Write the number of retries to the capture record
    db.session.add(capture_record)
    capture_record.retry = retries
    db.session.commit()

    try:
        # Check if we need to ignore certain hosts
        ip_addr = socket.gethostbyname(grab_domain(capture_record.url))

        if app.config['IP_BLACKLISTING']:
            if netaddr.all_matching_cidrs(
                    ip_addr, app.config['IP_BLACKLISTING_RANGE'].split(',')):
                capture_record.capture_status = "IP BLACKLISTED:{}".format(
                    ip_addr)
                if capture_record.callback:
                    finisher(capture_record)
                else:
                    capture_record.job_status = 'COMPLETED'
                return True
        # Perform a callback or complete the task depending on error code and config
        if capture_record.url_response_code > 400 and not app.config['CAPTURE_ERRORS']:
            if capture_record.callback:
                finisher(capture_record)
            else:
                capture_record.job_status = 'COMPLETED'
            return True
    # Only execute retries on ConnectionError exceptions, otherwise fail immediately
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(args=[status_code, base_url],
                                   kwargs={
                                       'capture_id': capture_id,
                                       'retries': capture_record.retry,
                                       'model': 'capture'
                                   },
                                   exc=err,
                                   countdown=app.config['COOLDOWN'],
                                   max_retries=app.config['MAX_RETRIES'])
    except Exception as err:
        app.logger.error(err)
        capture_record.job_status = 'FAILURE'
        if str(err):
            capture_record.capture_status = str(err)
    finally:
        db.session.commit()
    # First perform the captures, then either write to S3, perform a callback, or neither
    try:
        # call the main capture function to retrieve sketches, scrapes, and html
        files_to_write = do_capture(status_code,
                                    capture_record,
                                    base_url,
                                    model='capture',
                                    phantomjs_timeout=phantomjs_timeout)
        # Call the S3 save function if S3 is configured, and perform a callback if configured.
        if app.config['USE_S3']:
            if capture_record.callback:
                s3_save(files_to_write, capture_record)
                finisher(capture_record)
            else:
                s3_save(files_to_write, capture_record)
        elif capture_record.callback:
            finisher(capture_record)
    # If the screenshot generation timed out, try to render again
    except subprocess32.TimeoutExpired as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={
                'capture_id': capture_id,
                'retries': capture_record.retry,
                'model': 'capture',
                'phantomjs_timeout':
                (capture_record.retry * 5) + phantomjs_timeout
            },
            exc=err,
            countdown=app.config['COOLDOWN'],
            max_retries=app.config['MAX_RETRIES'])
    # Retry on connection error exceptions
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(args=[status_code, base_url],
                                   kwargs={
                                       'capture_id': capture_id,
                                       'retries': capture_record.retry,
                                       'model': 'capture'
                                   },
                                   exc=err,
                                   countdown=app.config['COOLDOWN'],
                                   max_retries=app.config['MAX_RETRIES'])
    # For all other exceptions, fail immediately
    except Exception as err:
        app.logger.error(err)
        if str(err):
            capture_record.capture_status = str(err)
        capture_record.job_status = 'FAILURE'
        raise
    finally:
        db.session.commit()
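
On each TimeoutExpired retry the task widens the PhantomJS window by five seconds per retry (capture_record.retry * 5 added to the previous timeout). Enqueuing the task from the producer side might look like this; apply_async is standard Celery, but the argument values are examples:

# Illustrative producer-side enqueue; argument values are examples only.
result = celery_capture.apply_async(
    args=[200, app.config['BASE_URL']],
    kwargs={'capture_id': capture_record.id, 'model': 'capture'})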