def stop(self, session: Session, success: bool = False):
    """
    Stop the statification process.
    :param session: the database session
    :param success: a boolean telling whether the process finished successfully or not, unsuccessful by default
    """
    # if the process did not finish successfully, remove the unfinished statification linked to it
    if not success:
        try:
            # get the statification with empty commit
            statification = Statification.get_statification(session, '')
            # delete the current Statification object with all its linked objects
            statification.delete(session)
            self.logger.info('There was a current statification, it has been deleted, process will continue normally')
        except (NoResultFound, IndexError):
            self.logger.info('There is no current statification, process will continue normally')

    # open the pid file in read mode
    try:
        f_pid_file = open(self.s_pid_file)
        # read the pid
        s_pid = f_pid_file.read()
        f_pid_file.close()

        if not s_pid == "":
            # convert the pid to an integer
            i_pid = int(s_pid)
            try:
                # kill the process with the pid
                os.kill(i_pid, signal.SIGTERM)
                os.kill(i_pid, signal.SIGINT)

                # wait for the process to terminate
                while True:
                    try:
                        # check if the process has been terminated (signal 0 only tests that it still exists)
                        os.kill(i_pid, 0)
                    except OSError:
                        self.logger.info('The process stopped successfully')
                        # continue once the process has been stopped
                        break
            except ProcessLookupError as e:
                # if the process was already stopped we do nothing, this is expected when the process
                # terminated correctly
                self.logger.debug('The process was already stopped: ' + str(e))
    except FileNotFoundError as e:
        # if the process was already stopped we do nothing, this is expected when the process terminated
        # correctly
        self.logger.debug('The process was already stopped: ' + str(e))
    finally:
        # erase the content of the pid file, and create it empty if it doesn't exist yet
        f_pid_file = open(self.s_pid_file, 'w')
        f_pid_file.close()
def service_get_satif_list(i_limit: int, i_skip: int, s_order: str) -> Dict[str, Any]:
    """
    Get the list of statifications requested with the following parameters:
    @param i_limit: number of statifications to request
    @param i_skip: number of statifications to skip
    @param s_order: name of the column to sort the statifications by
    @return a python dict containing the list of statifications returned by the request
    """
    order = Statification.id

    # get the Statification attribute corresponding to the column to order by
    if s_order == 'cre_date':
        order = Statification.cre_date
    elif s_order == 'upd_date':
        order = Statification.upd_date
    elif s_order == 'designation':
        order = Statification.designation
    elif s_order == 'status':
        order = Statification.status

    # get the first 'limit' statifications after skipping 'skip' of them; if there are fewer, return them all
    a_statifications = Statification.get_n_list_statifications(current_app.session, i_limit, i_skip, order, 'desc')

    return {'statifications': a_statifications}
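# A minimal sketch (not part of the original module) of how service_get_satif_list could be wired to a
# Flask route. The blueprint name, URL and query-parameter names are assumptions for illustration only.
from flask import Blueprint, jsonify, request

statification_api = Blueprint('statification_api', __name__)  # hypothetical blueprint


@statification_api.route('/api/statifications', methods=['GET'])
def get_statif_list():
    # read paging and sorting parameters from the query string, with safe defaults
    i_limit = request.args.get('limit', default=10, type=int)
    i_skip = request.args.get('skip', default=0, type=int)
    s_order = request.args.get('order', default='id', type=str)
    return jsonify(service_get_satif_list(i_limit, i_skip, s_order))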
def service_get_statif_count() -> Dict[str, int]:
    """
    Get the number of statifications in the database.
    @return the number of statifications
    """
    return {'count': Statification.get_count(current_app.session)}
def register_error_in_database(self, session: Session):
    """
    This method creates the database objects associated to the statification from the log that scrapy generated.
    :param session: the database session
    :raise NoResultFound if there is no statification with an empty commit sha
    """
    # finalize the statification by removing unwanted files and directories and empty directories
    self.delete_files()
    self.delete_directories()
    self.delete_empty_directories()

    # get the statification with empty commit
    statification = Statification.get_statification(session, '')

    # open the log file that contains the scrapy errors
    f_file = open(self.s_log_file)

    expecting_other_line_for_error_message = False
    s_error_message = ''

    # each line is searched for information that will be used to fill objects of the database
    for line in f_file:
        # check if the line contains a warning, an info or an error
        if re.match('(.*)WARNING(.*)', line) or re.match('(.*)INFO(.*)', line) or re.match('(.*) ERROR:(.*)', line):
            expecting_other_line_for_error_message = False

        if expecting_other_line_for_error_message:
            s_error_message += line

        if (not expecting_other_line_for_error_message) and s_error_message != '':
            statification.add_object_to_statification(ScrapyError, session, s_error_message)
            s_error_message = ''

        # in the case the line matches an external link
        if re.match('(.*) INFO: External link detected(.*)', line):
            # we get the second part of the line, as there are also [] in the first part
            s_trunked_line = line[line.index('INFO: External link detected'):len(line)]
            # we get the position of the beginning of the URL
            i_start_url = s_trunked_line.index('[')
            # we get the position of the end of the URL
            i_end_url = s_trunked_line.index(']')
            # we get the position of the beginning of the source url
            i_start_source = s_trunked_line.index(' in ') + 4

            try:
                # we create and add a new ExternalLink to our statification
                statification.add_object_to_statification(
                    ExternalLink, session,
                    s_trunked_line[i_start_source:len(s_trunked_line)],
                    s_trunked_line[i_start_url + 1:i_end_url])
            except ValueError as e:
                self.logger.info(e)
        # in the case the line matches a scrapy error
        elif re.match('(.*) ERROR:(.*)', line):
            expecting_other_line_for_error_message = True
            # retrieve the scrapy error
            s_trunked_line = line[line.index('ERROR: ') + 7:len(line)]
            s_error_message += s_trunked_line
        # in the case the line matches a MIME type error
        elif re.match('(.*) WARNING: Forbidden content (.*)', line):
            # we get the second part of the line, where the information that interests us begins
            s_trunked_line = line[line.index('WARNING: Forbidden content '):len(line)]
            # get the starting position of the MIME type error
            i_start_error_mime = s_trunked_line.index('[')
            # get the end position of the MIME type error
            i_end_error_mime = s_trunked_line.index(']')
            # get the MIME type error
            s_error_mime = s_trunked_line[i_start_error_mime + 1:i_end_error_mime]
            # get the source of the error
            s_source_link = s_trunked_line[s_trunked_line.index('detected in') + 12:len(s_trunked_line)]

            try:
                # create an ErrorTypeMIME associated to the statification
                statification.add_object_to_statification(ErrorTypeMIME, session, s_error_mime, s_source_link)
            except ValueError as e:
                self.logger.info(e)
        # in the case the line matches an HTTP error
        elif re.match('(.*) WARNING: HTTP error (.*)', line):
            # we get the second part of the line, where the information that interests us begins
            s_trunked_line = line[line.index('WARNING: HTTP error '):len(line)]
            # we get the starting position of the error code
            i_start_error_code = s_trunked_line.index('[')
            # we get the end position of the error code
            i_end_error_code = s_trunked_line.index(']')
            # we get the start position of the url that caused the error
            i_start_url = s_trunked_line.index(' for ')
            # we get the end position of the url that caused the error
            i_end_url = s_trunked_line.index(' from ')
            # we retrieve the error code
            s_error_code = s_trunked_line[i_start_error_code + 1:i_end_error_code]
            # we retrieve the url that caused the error
            s_url = s_trunked_line[i_start_url + 5:i_end_url]
            # we retrieve the url of the page where the faulty url was found
            s_url_source = s_trunked_line[i_end_url + 6:len(s_trunked_line) - 1]

            try:
                # we create a new HtmlError associated to the statification
                statification.add_object_to_statification(HtmlError, session, s_error_code, s_url, s_url_source)
            except ValueError as e:
                self.logger.info(e)
        elif re.match('(.*)response_received_count(.*)', line):
            # we get the second part of the line, where the information that interests us begins
            s_value_item_scraped_count = line[line.index(': ') + 2:line.index(',')]

            try:
                # set the number of crawled items into the statification object
                statification.upd_nb_item(session, statification.commit, int(s_value_item_scraped_count))
            except ValueError as e:
                self.logger.info(e)

    try:
        # retrieve the list of file types with the number of files for each type
        s_result_type_files = sh.uniq(
            sh.sort(
                sh.grep(
                    sh.find(sh.glob(self.s_repository_path + '/*'), '-type', 'f'),
                    '-o', '-E', r'\.[a-zA-Z0-9]+$')),
            '-c')

        # the result is a string, so we need to turn it into a table:
        # here we get a table made of each returned line, with all spaces removed
        a_table_result_type_files = s_result_type_files.replace(' ', '').split('\n')

        # browse the lines of the result
        for row in a_table_result_type_files:
            if row:
                # a line is composed of a number followed by a type like "42.png",
                # we separate the number and the type
                s_type_file = row.split('.')
                try:
                    # create a new ScannedFile associated to the statification
                    statification.add_object_to_statification(ScannedFile, session, s_type_file[1], int(s_type_file[0]))
                except ValueError as e:
                    self.logger.info(e)
    except sh.ErrorReturnCode_1:
        self.logger.info('There is no folder in the static repository')
    finally:
        # in all cases we need to close the file
        f_file.close()

    # change the status of the statification (NEEDS TO BE DONE AT THE END!)
    statification.upd_status(session, '', Status.STATIFIED)
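# For reference, the slicing logic above assumes scrapy log lines shaped like the hand-written,
# illustrative samples below; the timestamps and URLs are made up, only the markers
# ('External link detected', 'Forbidden content', 'HTTP error', 'response_received_count') and the
# '[...]', ' in ', ' for ', ' from ' delimiters matter for the parsing.
SAMPLE_LOG_LINES = [
    # external link: URL between '[' and ']', source page after ' in '
    "2021-01-01 12:00:00 [mirroring] INFO: External link detected [https://example.org/page] in https://intranet.example/index.html",
    # forbidden MIME type: type between '[' and ']', source page after 'detected in '
    "2021-01-01 12:00:01 [mirroring] WARNING: Forbidden content [application/zip] detected in https://intranet.example/download",
    # HTTP error: code between '[' and ']', faulty URL after ' for ', source page after ' from '
    "2021-01-01 12:00:02 [mirroring] WARNING: HTTP error [404] for https://intranet.example/missing from https://intranet.example/index.html",
    # scrapy stats line used to read the number of crawled items
    "'response_received_count': 42,",
]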
def start(self, session: Session, s_designation: str, s_description: str, s_user: str):
    """
    Start a statification process with scrapy.
    :param session: the database session
    :param s_designation: the designation of the new statification
    :param s_description: the description of the new statification
    :param s_user: the name of the user who started the operation
    :raise ValueError if one parameter is missing
    """
    if self.s_repository_path and os.path.isdir(self.s_repository_path) and self.s_urls and self.s_domains:
        try:
            # get the statification with empty commit
            statification = Statification.get_statification(session, '')
            # delete the current Statification object with all its linked objects
            statification.delete(session)
            self.logger.info('There was a current statification, it has been deleted, process will continue normally')
        except NoResultFound:
            self.logger.info('There is no current statification, process will continue normally')

        self.logger.info("Create Statification")

        # create a new statification with an empty commit ID
        statification = Statification('', s_designation, s_description, datetime.utcnow(), datetime.utcnow(),
                                      Status.CREATED)
        session.add(statification)
        session.commit()

        # create a StatificationHistoric
        statification.add_object_to_statification(StatificationHistoric, session, datetime.utcnow(), s_user,
                                                  Actions.CREATE_STATIFICATION)

        # erase the previous log file
        f_log_file = open(self.s_log_file, "w")
        f_log_file.close()

        try:
            # create a new environment for the subprocess call
            new_env = os.environ.copy()
            new_env["PYTHONPATH"] = self.s_python_path

            # create a subprocess that will run scrapy in the background
            process = sh.python3(
                'scrapy_cmd.py', 'crawl',
                '--loglevel=INFO',
                '--logfile=' + self.s_log_file,
                '-a', 'output=' + self.s_repository_path,
                '-a', 'urls="' + self.s_urls + '"',
                '-a', 'domains="' + self.s_domains + '"',
                '-a', 'url_regex="' + self.s_url_regex + '"',
                '-a', 'url_replacement="' + self.s_url_replacement + '"',
                '-a', 'crawler_count_file=' + self.s_crawler_progress_counter_file,
                'mirroring',
                _cwd=self.s_project_directory,
                _env=new_env,
                _bg=True,
                _tty_out=False,
                _done=self.done)

            # create the pid file if it doesn't exist, truncate it if it does
            f_pid_file = open(self.s_pid_file, "w")
            # write the new process pid in the file
            f_pid_file.write(str(process.pid))
            f_pid_file.close()
        except sh.ErrorReturnCode_1 as e:
            self.logger.info(str(e))
    else:
        raise ValueError("Check your parameters: it seems that one is empty or that a file doesn't exist")
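# sh invokes the _done callback as callback(cmd, success, exit_code) once the backgrounded command
# exits, which is exactly the signature of visualize_done / do_apply_prod_done / commit_done below.
# A minimal, hypothetical sketch of such a callback (self.done above is the project's real one,
# whose body is not shown in this section):
import logging

_logger = logging.getLogger(__name__)  # placeholder logger for this sketch


def crawl_done(cmd: str, success: bool, exit_code: int):
    # log the outcome of the backgrounded scrapy command
    if success:
        _logger.info('Crawling terminated successfully: %s', cmd)
    else:
        _logger.error('Crawling failed with exit code %d: %s', exit_code, cmd)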
def service_get_last_statif_infos() -> Dict[str, Any]:
    """
    Get the information of the last statification. The following information will be returned in a python dict:
    - sha: a string containing the sha of the last statification; if the last one is a new, unsaved
      statification it will be empty
    - designation: the designation of the last statification, or empty
    - description: the description of the last statification, or empty
    - status: the status of the last statification:
        CREATED = 0
        STATIFIED = 1
        SAVED = 2
        PRODUCTION = 3
        VISUALIZED = 4
      The default status is 3: if there is no statification in the database the user will still be able to
      create a new one, and if there is an ongoing statification to be pushed to prod it still gives the hand
      back to the user who saved it.
    - i_nb_item_to_crawl: the number of items crawled during the last statification; it will be used as a
      reference for the number of items to crawl in the next statification.

    @return a python dict containing the above information
        {
            sha,
            designation,
            description,
            status,
            i_nb_item_to_crawl
        }
    """
    # initialize the number of items to crawl to 0
    i_nb_item_to_crawl = 0
    # initialize sha to an empty value
    sha = ''
    # set designation and description as empty
    designation = ''
    description = ''

    # the default status is 3 (PRODUCTION): if there is no statification in the database the user will still
    # be able to create a new one, and if there is an ongoing statification to be pushed to prod it still
    # gives the hand back to the user who saved it.
    status = 3

    # get the 2 last statifications that have been created; when the statification process starts, the last
    # statification created will be the current (unsaved) one
    last_statification = Statification.get_n_list_statifications(current_app.session, 2, 0, Statification.id, 'desc')

    # check that the list of statifications isn't empty
    if len(last_statification) != 0:
        if len(last_statification) > 1:
            # get the number of items crawled in the previous statification (the one preceding the newly created one)
            i_nb_item_to_crawl = last_statification[1]['nb_item']

        # get the status and the sha of the last statification
        status = last_statification[0]['status']
        sha = last_statification[0]['sha']

        # if the last statification is a new one that has not been saved
        if sha == '':
            designation = last_statification[0]['designation']
            description = last_statification[0]['description']

    return {
        'sha': sha,
        'designation': designation,
        'description': description,
        'status': status,
        'i_nb_item_to_crawl': i_nb_item_to_crawl
    }
def service_get_statif_info(s_archive_sha: str) -> Dict[str, Any]:
    """
    Get the statification information for the given sha, with all the data included in associated objects.
    For the requested statification it will return a python dict containing:
    - statification: the data of the statification object
    - errors_type_mime: the list of MIME type errors
    - external_links: the list of external links
    - html_errors: the list of html errors
    - scanned_files: the list of scanned files
    - scrapy_errors: the list of scrapy errors
    - statification_historics: the list of statification historics.

    @return a python dict containing all the information of the requested statification
    """
    # initialize empty variables
    s_statification = None
    a_errors_type_mime = None
    a_external_links = None
    a_html_errors = None
    a_scanned_files = None
    a_scrapy_errors = None
    a_statification_historic = None

    try:
        # verify that the sha is valid, unless it is empty (the current, uncommitted statification)
        if s_archive_sha != '':
            validate_sha(s_archive_sha, current_app.config['ARCHIVE_REPOSITORY'])

        try:
            # get the statification corresponding to the sha and the associated objects
            statification = Statification.get_statification(current_app.session, s_archive_sha)

            # get the JSON from this statification
            s_statification = statification.get_dict()

            # get the other objects associated to the statification and their JSON
            try:
                a_errors_type_mime = statification.get_list_from_class(ErrorTypeMIME, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
            try:
                a_external_links = statification.get_list_from_class(ExternalLink, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
            try:
                a_html_errors = statification.get_list_from_class(HtmlError, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
            try:
                a_scanned_files = statification.get_list_from_class(ScannedFile, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
            try:
                a_scrapy_errors = statification.get_list_from_class(ScrapyError, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
            try:
                a_statification_historic = statification.get_list_from_class(StatificationHistoric, current_app.session)
            except NoResultFound as e:
                current_app.logger.info(e)
        except NoResultFound as e:
            current_app.logger.info(e)

        # return a python dict with all the information
        return {
            'statification': s_statification,
            'errors_type_mime': a_errors_type_mime,
            'external_links': a_external_links,
            'html_errors': a_html_errors,
            'scanned_files': a_scanned_files,
            'scrapy_errors': a_scrapy_errors,
            'statification_historics': a_statification_historic
        }
    except SyntaxError as e:
        current_app.logger.error(e)
        # return an error code if the sha is not valid
        return {'success': False, 'error': 'sha_unvalid'}
    except sh.ErrorReturnCode:
        # if an error occurred during a subprocess
        return {'success': False, 'error': 'system_fail'}
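# validate_sha is defined elsewhere in the project; the code above only assumes that it raises
# SyntaxError when the sha does not identify a valid commit of the archive repository. A possible
# sketch under that assumption (the `git cat-file` check is illustrative, not the project's code):
import re
import sh


def validate_sha(s_sha: str, s_repository_path: str):
    # a full git sha is 40 hexadecimal characters
    if not re.fullmatch('[0-9a-f]{40}', s_sha):
        raise SyntaxError('Invalid sha format: ' + s_sha)
    try:
        # check that the commit actually exists in the archive repository
        sh.git('cat-file', '-e', s_sha + '^{commit}', _cwd=s_repository_path)
    except sh.ErrorReturnCode:
        raise SyntaxError('Unknown commit: ' + s_sha)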
def visualize_done(cmd: str, success: bool, exit_code: int):
    """
    This method is called after the git.pull process has finished
    :param cmd: the string containing the command that launched git.pull
    :param success: a boolean, true on success and false otherwise
    :param exit_code: the exit code returned by the command
    :raise RuntimeError
    """
    # create a session for this specific code, because it's executed after the flask instance has been killed
    session = open_session_db(s_database_uri)

    if not success:
        raise RuntimeError('Operation was unsuccessful : ' + cmd)

    try:
        # reinitialize the repository before checking out the commit
        service_do_init_statif(s_visualize_repository, s_url_git)

        logger.info('> Checkout the visualized commit')

        git = sh.git.bake(_cwd=s_visualize_repository, _tty_out=False)

        # checkout the commit on the archive repository
        for line in git.checkout('-q', s_commit, _iter=True):
            logger.info(line)

        logger.info('> Change status for last visualized statification to saved')

        try:
            # change the status of the previously visualized statification to SAVED
            Statification.switch_status(session, Status.VISUALIZED, Status.SAVED)
        except (ValueError, NoResultFound) as e:
            # in case no statification had the status VISUALIZED before, we just catch the error
            # and continue as normal
            logger.info(str(e))

        logger.info('> Change status for new visualized statification')

        # now the statification is on the visualize repository, so we change the status from default to VISUALIZED
        Statification.upd_status(session, s_commit, Status.VISUALIZED)
        # update the date of update of the statification
        Statification.upd_upd_date(session, s_commit, datetime.utcnow())
        # create a StatificationHistoric to keep track of the modification
        Statification.static_add_object_to_statification(
            StatificationHistoric, session, s_commit, datetime.utcnow(), s_user, Actions.VISUALIZE_STATIFICATION)

        # on success write a success code
        write_status_background(
            {'success': True, 'operation': 'visualize', 'commit': s_commit},
            s_file_status_background)
    except (ValueError, NoResultFound) as e:
        # there is no reason for the process to reach this code
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'database', 'operation': 'visualize'},
            s_file_status_background)
    except RuntimeError as e:
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'subprocess', 'operation': 'visualize'},
            s_file_status_background)
    except sh.ErrorReturnCode as e:
        # if an error happened during a subprocess
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'subprocess', 'operation': 'visualize'},
            s_file_status_background)
    finally:
        unlock_access(s_lock_file)
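# write_status_background and unlock_access are shared helpers of the project used by the callbacks
# above and below. Minimal sketches under the assumption that the status file simply holds a JSON
# dict and that the lock is a plain file on disk (the real implementations may differ):
import json
import os


def write_status_background(d_status: dict, s_file_status_background: str):
    # overwrite the status file with the JSON representation of the given dict
    with open(s_file_status_background, 'w') as f_status:
        json.dump(d_status, f_status)


def unlock_access(s_lock_file: str):
    # release the route lock by removing the lock file, ignoring the case where it is already gone
    try:
        os.remove(s_lock_file)
    except FileNotFoundError:
        pass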
def do_apply_prod_done(cmd: str, success: bool, exit_code: int):
    """
    This method is called after the git.branch process has finished
    :param cmd: the string containing the command that launched git.push
    :param success: a boolean, true on success and false otherwise
    :param exit_code: the exit code returned by the command
    :raise RuntimeError
    """
    # create a session for this specific code, because it's executed after the flask instance has been killed
    session = open_session_db(s_database_uri)

    if not success:
        raise RuntimeError('Operation was unsuccessful : ' + cmd)

    # select the git repository
    git = sh.git.bake(_cwd=s_static_repository, _tty_out=False)

    logger.info('> Push the statification to production server(s)')

    for url in urls_git_prod:
        logger.info('> Push statification to server : ' + url)
        # push the commit to the production server
        for log in git.push('-f', url, 'production', _iter='out'):
            logger.info(log)

    logger.info('> Change status of the last statification pushed to prod to saved')

    # catch errors if there are some
    try:
        # change the status of the statification previously put in production to SAVED
        Statification.switch_status(session, Status.PRODUCTION, Status.SAVED)
    except (ValueError, NoResultFound) as e:
        # in case no statification had the status PRODUCTION before, we just catch the error
        # and continue as normal
        logger.info(str(e))

    logger.info('> Change status of the new statification pushed to prod')

    try:
        # change the status of the statification pushed to prod to PRODUCTION
        Statification.upd_status(session, s_commit, Status.PRODUCTION)
        # update the date of update of the statification
        Statification.upd_upd_date(session, s_commit, datetime.utcnow())
        # create a StatificationHistoric to keep track of the modification
        Statification.static_add_object_to_statification(
            StatificationHistoric, session, s_commit, datetime.utcnow(), s_user, Actions.PUSHTOPROD_STATIFICATION)

        logger.info('> Push to prod operations terminated')

        write_status_background(
            {'success': True, 'operation': 'pushtoprod', 'commit': s_commit},
            s_file_status_background)
    except RuntimeError as e:
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'subprocess', 'operation': 'pushtoprod'},
            s_file_status_background)
    except (ValueError, NoResultFound) as e:
        # there is no reason for the process to reach this code
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'database', 'operation': 'pushtoprod'},
            s_file_status_background)
    finally:
        # always unlock the route
        unlock_access(s_lock_file)
def commit_done(cmd: str, success: bool, exit_code: int):
    """
    This method is called after the git.push process has finished
    :param cmd: the string containing the command that launched git.push
    :param success: a boolean, true on success and false otherwise
    :param exit_code: the exit code returned by the command
    :raise RuntimeError
    """
    # create a session for this specific code, because it's executed after the flask instance has been killed
    session = open_session_db(s_database_uri)

    if not success:
        raise RuntimeError('Operation was unsuccessful : ' + cmd)

    logger.info('> Rename log file with commit sha')

    try:
        # rename the logfile of the statification with the commit SHA
        os.rename(s_log_file, s_log_dir + "/" + s_commit + ".log")

        logger.info('> Register Commit into the database')

        # update the current statification that has no commit with the new commit sha
        Statification.upd_commit(session, '', s_commit)
        # now the statification is on git, so we change the status from STATIFIED to SAVED
        Statification.upd_status(session, s_commit, Status.SAVED)
        # update the date of update of the statification
        Statification.upd_upd_date(session, s_commit, datetime.utcnow())
        # create a StatificationHistoric to keep track of the modification
        Statification.static_add_object_to_statification(
            StatificationHistoric, session, s_commit, datetime.utcnow(), s_user, Actions.COMMIT_STATIFICATION)

        logger.info('> Commit operations terminated')

        # on success write a success code and the commit id
        write_status_background(
            {'success': True, 'commit': s_commit, 'operation': 'commit'},
            s_file_status_background)
    except RuntimeError as e:
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'subprocess', 'operation': 'commit'},
            s_file_status_background)
    except FileNotFoundError as e:
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'log_file', 'operation': 'commit'},
            s_file_status_background)
    except (ValueError, NoResultFound) as e:
        # if no statification was found for the given commit,
        # write an error in the statusBackground file
        logger.error(str(e))
        write_status_background(
            {'success': False, 'error': 'database', 'operation': 'commit'},
            s_file_status_background)
    finally:
        # always unlock the route
        unlock_access(s_lock_file)
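# A hedged illustration (not the project's actual call site) of how a callback such as commit_done
# is typically attached to a backgrounded git command through sh's _done hook, mirroring the way the
# scrapy subprocess is launched in start() above; the remote and branch names are placeholders:
import sh


def push_statification_in_background():
    git = sh.git.bake(_cwd=s_static_repository, _tty_out=False)
    # sh will invoke commit_done(cmd, success, exit_code) once the background push exits
    git.push('origin', 'master', _bg=True, _done=commit_done)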