def finisher(the_record):
    """POST the finished capture record to the callback URL provided.

    Prepends a blacklist marker to capture_status when the target's
    resolved IP falls inside the configured blacklist CIDR ranges, POSTs
    the record as JSON to ``the_record.callback``, and marks the job
    COMPLETED on success.

    Raises:
        requests.HTTPError: when the callback answers with a 4xx/5xx
            status (via ``raise_for_status``).
    """
    db.session.add(the_record)
    verify_ssl = app.config['SSL_HOST_VALIDATION']
    # Set the correct headers for the postback
    headers = {'Content-type': 'application/json',
               'Accept': 'text/plain',
               'Connection': 'close'}
    try:
        # Blacklist IP addresses: flag the record when the target
        # resolves into a blacklisted CIDR range.
        ip_addr = socket.gethostbyname(grab_domain(the_record.url))
        if app.config['IP_BLACKLISTING']:
            if netaddr.all_matching_cidrs(
                    ip_addr, app.config['IP_BLACKLISTING_RANGE'].split(',')):
                the_record.capture_status = (
                    "IP BLACKLISTED:{} - ".format(ip_addr)
                    + the_record.capture_status)
    except Exception:
        # Best-effort check only: DNS or config failures must not
        # prevent the callback from firing.
        pass
    req = post(the_record.callback, verify=verify_ssl,
               data=json.dumps(the_record.as_dict()), headers=headers)
    # If a 4xx or 5xx status is received, raise an exception
    req.raise_for_status()
    # Update capture_record and save to database.  capture_status is
    # deliberately not overwritten so a blacklist message set above is
    # propagated to the caller.
    the_record.job_status = 'COMPLETED'
    db.session.add(the_record)
    db.session.commit()
def finisher(the_record):
    """ POST finished chain to a callback URL provided """
    db.session.add(the_record)
    verify_ssl = app.config['SSL_HOST_VALIDATION']
    # Headers sent along with the callback postback.
    postback_headers = {
        'Content-type': 'application/json',
        'Accept': 'text/plain',
        'Connection': 'close',
    }
    try:
        # When blacklisting is enabled and the target resolves into a
        # blacklisted CIDR range, prefix the capture status accordingly.
        resolved_ip = socket.gethostbyname(grab_domain(the_record.url))
        if app.config['IP_BLACKLISTING']:
            cidr_ranges = app.config['IP_BLACKLISTING_RANGE'].split(',')
            if netaddr.all_matching_cidrs(resolved_ip, cidr_ranges):
                prefix = "IP BLACKLISTED:{} - ".format(resolved_ip)
                the_record.capture_status = prefix + the_record.capture_status
    except:
        pass
    callback_response = post(
        the_record.callback,
        verify=verify_ssl,
        data=json.dumps(the_record.as_dict()),
        headers=postback_headers,
    )
    # A 4xx or 5xx response from the callback raises an exception here
    callback_response.raise_for_status()
    # Mark the job complete; capture_status is left alone so any
    # blacklist message set above survives the callback.
    the_record.job_status = 'COMPLETED'
    db.session.add(the_record)
    db.session.commit()
def do_capture(status_code, the_record, base_url, model='capture',
               phantomjs_timeout=35):
    """
    Create a screenshot (sketch), text scrape, and html capture from a
    provided url or html file.

    This depends on phantomjs and an associated javascript file to
    perform the captures.  In the event an error occurs, an exception is
    raised and handled by the celery task or the controller that called
    this method.

    Args:
        status_code: HTTP status of the original url fetch (kept for
            interface compatibility with callers; unused here).
        the_record: capture/static model instance to update.
        base_url: public base URL used to build the file links.
        model: 'capture' for live urls, 'static' for uploaded html.
        phantomjs_timeout: seconds to wait before killing PhantomJS
            (previously hard-coded to 35).

    Returns:
        defaultdict mapping 'sketch'/'scrape'/'html' to the local
        filenames that may need to be written to S3.
    """
    # Attach the record to the session so the updates below are tracked
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS driver
    if model == 'static':
        capture_name = the_record.filename
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            + '/assets/static.js',
            app.config['LOCAL_STORAGE_FOLDER'],
            capture_name]
        content_to_parse = os.path.join(
            app.config['LOCAL_STORAGE_FOLDER'], capture_name)
    else:
        capture_name = grab_domain(the_record.url) + '_' + str(the_record.id)
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            + '/assets/capture.js',
            the_record.url,
            os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name)]
        content_to_parse = os.path.join(
            app.config['LOCAL_STORAGE_FOLDER'], capture_name + '.html')

    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=phantomjs_timeout)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error('PhantomJS Static Capture timeout')
        raise Exception('PhantomJS Static Capture timeout')
    # If the subprocess produced any output, treat it as an error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ('script', 'noscript', 'style')
    with open(content_to_parse, 'r') as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ''
        tail = elt.tail or ''
        wordz = " ".join((text, tail)).strip('\t')
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode('utf-8')

    # Since the filename format is different for static captures, update the
    # filename.  This will ensure the URLs are pointing to the correct resources
    if model == 'static':
        capture_name = capture_name.split('.')[0]

    # Write our html text that was parsed into our capture folder; the
    # 'with' block closes the handle (the original leaked it)
    with open(os.path.join(app.config['LOCAL_STORAGE_FOLDER'],
                           capture_name + '.txt'), 'wb') as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape,
    # and html captures
    the_record.sketch_url = base_url + '/files/' + capture_name + '.png'
    the_record.scrape_url = base_url + '/files/' + capture_name + '.txt'
    the_record.html_url = base_url + '/files/' + capture_name + '.html'

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write['sketch'] = capture_name + '.png'
    files_to_write['scrape'] = capture_name + '.txt'
    files_to_write['html'] = capture_name + '.html'

    # If we are not writing to S3, update the capture_status that we are completed.
    if not app.config['USE_S3']:
        the_record.job_status = "COMPLETED"
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    else:
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    db.session.commit()
    return files_to_write
def do_capture(status_code, the_record, base_url, model="capture", phantomjs_timeout=app.config["PHANTOMJS_TIMEOUT"]):
    """
    Create a screenshot (sketch), text scrape, and html capture from a
    provided url or html file.

    This depends on phantomjs and an associated javascript file to
    perform the captures.  In the event an error occurs, an exception is
    raised and handled by the celery task or the controller that called
    this method.

    Args:
        status_code: HTTP status of the original url fetch (kept for
            interface compatibility with callers; unused here).
        the_record: capture/static model instance to update.
        base_url: public base URL used to build the file links.
        model: "capture" for live urls, "static" for uploaded html.
        phantomjs_timeout: seconds to wait before killing PhantomJS.

    Returns:
        defaultdict mapping "sketch"/"scrape"/"html" to the local
        filenames that may need to be written to S3.

    Raises:
        subprocess32.TimeoutExpired: when PhantomJS exceeds the timeout
            (callers use this to retry with a longer timeout).
    """
    # Attach the record to the session so the updates below are tracked
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS driver
    if model == "static":
        capture_name = the_record.filename
        service_args = [
            app.config["PHANTOMJS"],
            "--ssl-protocol=any",
            "--ignore-ssl-errors=yes",
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/assets/static.js",
            app.config["LOCAL_STORAGE_FOLDER"],
            capture_name,
        ]
        content_to_parse = os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name)
    else:
        capture_name = grab_domain(the_record.url) + "_" + str(the_record.id)
        service_args = [
            app.config["PHANTOMJS"],
            "--ssl-protocol=any",
            "--ignore-ssl-errors=yes",
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/assets/capture.js",
            the_record.url,
            os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name),
        ]
        content_to_parse = os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name + ".html")

    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=phantomjs_timeout)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error("PhantomJS Capture timeout at {} seconds".format(phantomjs_timeout))
        raise subprocess32.TimeoutExpired("phantomjs capture", phantomjs_timeout)
    # If the subprocess produced any output, treat it as an error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ("script", "noscript", "style")
    with open(content_to_parse, "r") as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ""
        tail = elt.tail or ""
        wordz = " ".join((text, tail)).strip("\t")
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode("utf-8")

    # Since the filename format is different for static captures, update the
    # filename.  This will ensure the URLs are pointing to the correct resources
    if model == "static":
        capture_name = capture_name.split(".")[0]

    # Write our html text that was parsed into our capture folder; the
    # 'with' block closes the handle (the original leaked it)
    with open(os.path.join(app.config["LOCAL_STORAGE_FOLDER"], capture_name + ".txt"), "wb") as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape,
    # and html captures
    the_record.sketch_url = base_url + "/files/" + capture_name + ".png"
    the_record.scrape_url = base_url + "/files/" + capture_name + ".txt"
    the_record.html_url = base_url + "/files/" + capture_name + ".html"

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write["sketch"] = capture_name + ".png"
    files_to_write["scrape"] = capture_name + ".txt"
    files_to_write["html"] = capture_name + ".html"

    # If we are not writing to S3, update the capture_status that we are completed.
    if not app.config["USE_S3"]:
        the_record.job_status = "COMPLETED"
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    else:
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    db.session.commit()
    return files_to_write
def celery_capture(
    self,
    status_code,
    base_url,
    capture_id=0,
    retries=0,
    model="capture",
    phantomjs_timeout=app.config["PHANTOMJS_TIMEOUT"],
):
    """
    Celery task used to create sketch, scrape, html.

    Task also writes files to S3 or posts a callback depending on
    configuration file.  Retries on ConnectionError and PhantomJS
    timeouts (with a growing timeout); all other errors fail the task
    immediately.
    """
    capture_record = Capture.query.filter(Capture.id == capture_id).first()
    # Write the number of retries to the capture record
    db.session.add(capture_record)
    capture_record.retry = retries
    db.session.commit()

    try:
        # Check if we need to ignore certain hosts
        ip_addr = socket.gethostbyname(grab_domain(capture_record.url))
        if app.config["IP_BLACKLISTING"]:
            if netaddr.all_matching_cidrs(ip_addr, app.config["IP_BLACKLISTING_RANGE"].split(",")):
                capture_record.capture_status = "IP BLACKLISTED:{}".format(ip_addr)
                if capture_record.callback:
                    finisher(capture_record)
                else:
                    capture_record.job_status = "COMPLETED"
                return True
        # Perform a callback or complete the task depending on error code and config
        if capture_record.url_response_code > 400 and not app.config["CAPTURE_ERRORS"]:
            if capture_record.callback:
                finisher(capture_record)
            else:
                capture_record.job_status = "COMPLETED"
            return True
    # Only execute retries on ConnectionError exceptions, otherwise fail immediately
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        # NOTE: pass the already-incremented counter; the original added
        # 1 twice here, skipping a retry count.
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={"capture_id": capture_id, "retries": capture_record.retry, "model": "capture"},
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    except Exception as err:
        app.logger.error(err)
        capture_record.job_status = "FAILURE"
        if str(err):
            capture_record.capture_status = str(err)
    finally:
        db.session.commit()

    # First perform the captures, then either write to S3, perform a callback, or neither
    try:
        # Call the main capture function to retrieve sketches, scrapes, and html
        files_to_write = do_capture(
            status_code, capture_record, base_url, model="capture", phantomjs_timeout=phantomjs_timeout
        )
        # Call the s3 save function if s3 is configured, and perform callback if configured.
        if app.config["USE_S3"]:
            if capture_record.callback:
                s3_save(files_to_write, capture_record)
                finisher(capture_record)
            else:
                s3_save(files_to_write, capture_record)
        elif capture_record.callback:
            finisher(capture_record)
    # If the screenshot generation timed out, try to render again
    except subprocess32.TimeoutExpired as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={
                "capture_id": capture_id,
                "retries": capture_record.retry,
                "model": "capture",
                # Give PhantomJS 5 more seconds per attempt
                "phantomjs_timeout": (capture_record.retry * 5) + phantomjs_timeout,
            },
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    # Retry on connection error exceptions
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = "RETRY"
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={"capture_id": capture_id, "retries": capture_record.retry, "model": "capture"},
            exc=err,
            countdown=app.config["COOLDOWN"],
            max_retries=app.config["MAX_RETRIES"],
        )
    # For all other exceptions, fail immediately
    except Exception as err:
        app.logger.error(err)
        if str(err):
            capture_record.capture_status = str(err)
        capture_record.job_status = "FAILURE"
        # Re-raise the original error instead of a bare, message-less
        # Exception so the failure reason reaches the celery backend.
        raise
    finally:
        db.session.commit()
def do_capture(status_code, the_record, base_url, model='capture',
               phantomjs_timeout=35):
    """
    Create a screenshot (sketch), text scrape, and html capture from a
    provided url or html file.

    This depends on phantomjs and an associated javascript file to
    perform the captures.  In the event an error occurs, an exception is
    raised and handled by the celery task or the controller that called
    this method.

    Args:
        status_code: HTTP status of the original url fetch (kept for
            interface compatibility with callers; unused here).
        the_record: capture/static model instance to update.
        base_url: public base URL used to build the file links.
        model: 'capture' for live urls, 'static' for uploaded html.
        phantomjs_timeout: seconds to wait before killing PhantomJS
            (previously hard-coded to 35).

    Returns:
        defaultdict mapping 'sketch'/'scrape'/'html' to the local
        filenames that may need to be written to S3.
    """
    # Attach the record to the session so the updates below are tracked
    db.session.add(the_record)
    # If the capture is for static content, use a different PhantomJS driver
    if model == 'static':
        capture_name = the_record.filename
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            + '/assets/static.js',
            app.config['LOCAL_STORAGE_FOLDER'],
            capture_name
        ]
        content_to_parse = os.path.join(
            app.config['LOCAL_STORAGE_FOLDER'], capture_name)
    else:
        capture_name = grab_domain(the_record.url) + '_' + str(the_record.id)
        service_args = [
            app.config['PHANTOMJS'],
            '--ssl-protocol=any',
            '--ignore-ssl-errors=yes',
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            + '/assets/capture.js',
            the_record.url,
            os.path.join(app.config['LOCAL_STORAGE_FOLDER'], capture_name)
        ]
        content_to_parse = os.path.join(
            app.config['LOCAL_STORAGE_FOLDER'], capture_name + '.html')

    # Using subprocess32 backport, call phantom and if process hangs kill it
    pid = subprocess32.Popen(service_args, stdout=PIPE, stderr=PIPE)
    try:
        stdout, stderr = pid.communicate(timeout=phantomjs_timeout)
    except subprocess32.TimeoutExpired:
        pid.kill()
        stdout, stderr = pid.communicate()
        app.logger.error('PhantomJS Static Capture timeout')
        raise Exception('PhantomJS Static Capture timeout')
    # If the subprocess produced any output, treat it as an error
    if stderr or stdout:
        raise Exception(stderr)

    # Strip tags and parse out all text
    ignore_tags = ('script', 'noscript', 'style')
    with open(content_to_parse, 'r') as content_file:
        content = content_file.read()
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    doc = LH.fromstring(content)
    output = ""
    for elt in doc.iterdescendants():
        if elt.tag in ignore_tags:
            continue
        text = elt.text or ''
        tail = elt.tail or ''
        wordz = " ".join((text, tail)).strip('\t')
        if wordz and len(wordz) >= 2 and not re.match("^[ \t\n]*$", wordz):
            output += wordz.encode('utf-8')

    # Since the filename format is different for static captures, update the
    # filename.  This will ensure the URLs are pointing to the correct resources
    if model == 'static':
        capture_name = capture_name.split('.')[0]

    # Write our html text that was parsed into our capture folder; the
    # 'with' block closes the handle (the original leaked it)
    with open(os.path.join(app.config['LOCAL_STORAGE_FOLDER'],
                           capture_name + '.txt'), 'wb') as parsed_text:
        parsed_text.write(output)

    # Update the sketch record with the local URLs for the sketch, scrape,
    # and html captures
    the_record.sketch_url = base_url + '/files/' + capture_name + '.png'
    the_record.scrape_url = base_url + '/files/' + capture_name + '.txt'
    the_record.html_url = base_url + '/files/' + capture_name + '.html'

    # Create a dict that contains what files may need to be written to S3
    files_to_write = defaultdict(list)
    files_to_write['sketch'] = capture_name + '.png'
    files_to_write['scrape'] = capture_name + '.txt'
    files_to_write['html'] = capture_name + '.html'

    # If we are not writing to S3, update the capture_status that we are completed.
    if not app.config['USE_S3']:
        the_record.job_status = "COMPLETED"
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    else:
        the_record.capture_status = "LOCAL_CAPTURES_CREATED"
    db.session.commit()
    return files_to_write
class Eager(Resource):
    """
    Provides a way to retrieve a sketch, scrape, or html file eagerly
    (blocking call)

    Methods:
    GET

    Args:
    url = url to generate a text scrape.
    type = ['sketch', 'scrape', 'html'] string to specify capture type
    """
    def get(self):
        """ Create a capture synchronously and return the requested file """
        args = EAGERPARSER.parse_args()
        base_url = app.config['BASE_URL']
        # Eager captures are always served from local storage, never S3
        app.config.update(USE_S3='')
        # Parse out url and capture type
        capture_record = Capture()
        capture_record.url = args["url"]
        capture_type = args["type"]
        if capture_type not in ['html', 'sketch', 'scrape']:
            return 'Incorrect capture type specified: html, sketch, or scrape', 406
        # Write to DB
        try:
            db.session.add(capture_record)
            db.session.commit()
        except IntegrityError as exc:
            # py3-compatible 'as' syntax; str(exc) replaces the
            # deprecated exc.message attribute
            return {"error": str(exc)}, 500
        # Refresh capture_record to obtain an ID for record
        db.session.refresh(capture_record)
        try:
            # Map each capture type to the filename do_capture will create
            base_name = grab_domain(capture_record.url) + '_' + str(capture_record.id)
            capture_names = {
                'html': base_name + '.html',
                'sketch': base_name + '.png',
                'scrape': base_name + '.txt'
            }
        except Exception:
            # grab_domain failed to parse the url
            return 'This is not a valid URL', 406
        # Check that url is valid and responsive
        if not check_url(capture_record):
            return 'Could not connect to URL', 406
        try:
            # Call do_capture to create scrape, sketch, and html (blocking).
            # capture_type was validated above, so the lookup cannot miss.
            tasks.do_capture(200, capture_record, base_url)
            return send_from_directory(
                app.config['LOCAL_STORAGE_FOLDER'],
                capture_names[capture_type],
                as_attachment=True)
        except Exception as err:
            # Consider updating capture_record status here
            app.logger.error(err)
            return str(err), 406
def celery_capture(self, status_code, base_url, capture_id=0, retries=0,
                   model="capture",
                   phantomjs_timeout=app.config['PHANTOMJS_TIMEOUT']):
    """
    Celery task used to create sketch, scrape, html.

    Task also writes files to S3 or posts a callback depending on
    configuration file.  Retries on ConnectionError and PhantomJS
    timeouts (with a growing timeout); all other errors fail the task
    immediately.
    """
    capture_record = Capture.query.filter(Capture.id == capture_id).first()
    # Write the number of retries to the capture record
    db.session.add(capture_record)
    capture_record.retry = retries
    db.session.commit()

    try:
        # Check if we need to ignore certain hosts
        ip_addr = socket.gethostbyname(grab_domain(capture_record.url))
        if app.config['IP_BLACKLISTING']:
            if netaddr.all_matching_cidrs(
                    ip_addr, app.config['IP_BLACKLISTING_RANGE'].split(',')):
                capture_record.capture_status = "IP BLACKLISTED:{}".format(
                    ip_addr)
                if capture_record.callback:
                    finisher(capture_record)
                else:
                    capture_record.job_status = 'COMPLETED'
                return True
        # Perform a callback or complete the task depending on error code and config
        if (capture_record.url_response_code > 400
                and not app.config['CAPTURE_ERRORS']):
            if capture_record.callback:
                finisher(capture_record)
            else:
                capture_record.job_status = 'COMPLETED'
            return True
    # Only execute retries on ConnectionError exceptions, otherwise fail immediately
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        # NOTE: pass the already-incremented counter; the original added
        # 1 twice here, skipping a retry count.
        raise celery_capture.retry(args=[status_code, base_url],
                                   kwargs={
                                       'capture_id': capture_id,
                                       'retries': capture_record.retry,
                                       'model': 'capture'
                                   },
                                   exc=err,
                                   countdown=app.config['COOLDOWN'],
                                   max_retries=app.config['MAX_RETRIES'])
    except Exception as err:
        app.logger.error(err)
        capture_record.job_status = 'FAILURE'
        if str(err):
            capture_record.capture_status = str(err)
    finally:
        db.session.commit()

    # First perform the captures, then either write to S3, perform a callback, or neither
    try:
        # call the main capture function to retrieve sketches, scrapes, and html
        files_to_write = do_capture(status_code,
                                    capture_record,
                                    base_url,
                                    model='capture',
                                    phantomjs_timeout=phantomjs_timeout)
        # Call the s3 save function if s3 is configured, and perform callback if configured.
        if app.config['USE_S3']:
            if capture_record.callback:
                s3_save(files_to_write, capture_record)
                finisher(capture_record)
            else:
                s3_save(files_to_write, capture_record)
        elif capture_record.callback:
            finisher(capture_record)
    # If the screenshot generation timed out, try to render again
    except subprocess32.TimeoutExpired as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(
            args=[status_code, base_url],
            kwargs={
                'capture_id': capture_id,
                'retries': capture_record.retry,
                'model': 'capture',
                # Give PhantomJS 5 more seconds per attempt
                'phantomjs_timeout': (capture_record.retry * 5) + phantomjs_timeout
            },
            exc=err,
            countdown=app.config['COOLDOWN'],
            max_retries=app.config['MAX_RETRIES'])
    # Retry on connection error exceptions
    except ConnectionError as err:
        app.logger.error(err)
        capture_record.job_status = 'RETRY'
        capture_record.capture_status = str(err)
        capture_record.retry = retries + 1
        raise celery_capture.retry(args=[status_code, base_url],
                                   kwargs={
                                       'capture_id': capture_id,
                                       'retries': capture_record.retry,
                                       'model': 'capture'
                                   },
                                   exc=err,
                                   countdown=app.config['COOLDOWN'],
                                   max_retries=app.config['MAX_RETRIES'])
    # For all other exceptions, fail immediately
    except Exception as err:
        app.logger.error(err)
        if str(err):
            capture_record.capture_status = str(err)
        capture_record.job_status = 'FAILURE'
        # Re-raise the original error instead of a bare, message-less
        # Exception so the failure reason reaches the celery backend.
        raise
    finally:
        db.session.commit()