def stop(self, session: Session, success: bool = False):
        """
        Stop the statification process.

        When the process did not finish successfully, the statification with an empty commit
        (if there is one) is deleted first. Then the process whose pid is stored in the pid file
        is sent SIGTERM and SIGINT and the method waits until that pid has disappeared.
        In all cases the pid file is emptied (it is created if it doesn't exist yet).

        :param session : the database session
        :param success : a boolean to know if the process has finished successfully or not,
                         by default it's unsuccessful
        """
        import time

        # only an unsuccessful run leaves an unfinished statification that has to be removed
        if not success:
            try:
                # get the statification with empty commit
                statification = Statification.get_statification(session, '')
                # delete the current Statification object with all the linked objects
                statification.delete(session)
                self.logger.info(
                    'There was a current statification, it has been deleted, process will continue normally'
                )
            except (NoResultFound, IndexError):
                self.logger.info(
                    'There is no current statification, process will continue normally'
                )

        try:
            # read the pid, the context manager closes the file even if the read fails
            with open(self.s_pid_file) as f_pid_file:
                # surrounding whitespace (e.g. a lone newline) must not be mistaken for a pid
                s_pid = f_pid_file.read().strip()

            if s_pid:
                i_pid = int(s_pid)
                try:
                    # ask the process to terminate
                    os.kill(i_pid, signal.SIGTERM)
                    os.kill(i_pid, signal.SIGINT)

                    # wait for the process to terminate
                    while True:
                        try:
                            # signal 0 sends nothing, it only checks that the pid still exists
                            os.kill(i_pid, 0)
                        except OSError:
                            self.logger.info(
                                'The process stopped successfully')
                            break
                        # sleep between two checks instead of spinning on the CPU
                        time.sleep(0.1)
                except ProcessLookupError as e:
                    # the process was already stopped, this is normal if it terminated correctly
                    self.logger.debug('The process was already stopped' +
                                      str(e))
        except FileNotFoundError as e:
            # no pid file means no process to stop, this is normal if it terminated correctly
            self.logger.debug('The process was already stopped' + str(e))
        finally:
            # erase the content of the pid file and create it empty if it doesn't exist yet
            with open(self.s_pid_file, 'w'):
                pass
Example #2
0
def service_get_satif_list(i_limit: int, i_skip: int, s_order: str) -> Dict[str, Any]:
    """
    Fetch one page of statifications.
    @param i_limit: number of statifications to request
    @param i_skip: number of statifications to skip
    @param s_order: name of the column to sort the statifications by, one of 'cre_date', 'upd_date',
                    'designation' or 'status'; any other value sorts by id
    @return a python dict whose 'statifications' entry holds the list returned by the request
    """
    # Statification attribute to sort by for each supported column name
    sortable_columns = {
        'cre_date': Statification.cre_date,
        'upd_date': Statification.upd_date,
        'designation': Statification.designation,
        'status': Statification.status,
    }
    sort_column = sortable_columns.get(s_order, Statification.id)

    # request 'i_limit' statifications after skipping 'i_skip' of them, with 'desc' as sort direction
    page = Statification.get_n_list_statifications(
        current_app.session, i_limit, i_skip, sort_column, 'desc')

    return {'statifications': page}
Example #3
0
def service_get_statif_count() -> Dict[str, int]:
    """
    Count the statifications stored in the database.
    @return a python dict whose 'count' entry is the value returned by Statification.get_count
    """
    i_count = Statification.get_count(current_app.session)
    return {'count': i_count}
    def register_error_in_database(self, session: Session):
        """
        Create the database objects associated to the statification from the log that scrapy
        has generated, then flag the statification as STATIFIED.

        The method first removes the unwanted files and directories, then reads the scrapy log file
        line by line to register scrapy errors, external links, MIME type errors, HTTP errors and the
        number of received responses, and finally counts the files of the static repository
        by extension.

        :param session: the database session
        :raise NoResultFound if there is no statification with empty commit sha
        """

        # finalization of the statification by removing unwanted files and directories and empty directories
        self.delete_files()
        self.delete_directories()
        self.delete_empty_directories()

        # get the statification with empty commit
        statification = Statification.get_statification(session, '')

        # True while the lines being read belong to a multi-line scrapy error message
        expecting_other_line_for_error_message = False
        s_error_message = ''

        # the context manager closes the log file even if a malformed line makes the parsing fail
        with open(self.s_log_file) as f_file:
            # each line is inspected for information used to fill objects of the database
            for line in f_file:

                # a new WARNING, INFO or ERROR record ends the error message being accumulated
                if re.match('(.*)WARNING(.*)', line) or re.match(
                        '(.*)INFO(.*)', line) or re.match('(.*) ERROR:(.*)', line):
                    expecting_other_line_for_error_message = False

                if expecting_other_line_for_error_message:
                    s_error_message += line

                # the accumulated error message is complete, register it
                if (not expecting_other_line_for_error_message
                    ) and s_error_message != '':
                    statification.add_object_to_statification(
                        ScrapyError, session, s_error_message)
                    s_error_message = ''

                # in the case the line match an External link
                if re.match('(.*) INFO: External link detected(.*)', line):
                    # keep the second part of the line only, there are also [] in the first part
                    s_trunked_line = line[line.index('INFO: External link detected'):]

                    # position of the beginning of the URL
                    i_start_url = s_trunked_line.index('[')
                    # position of the end of the URL
                    i_end_url = s_trunked_line.index(']')
                    # position of the beginning of the source url
                    i_start_source = s_trunked_line.index(' in ') + 4

                    try:
                        # create and add a new ExternalLink to the statification
                        statification.add_object_to_statification(
                            ExternalLink, session,
                            s_trunked_line[i_start_source:],
                            s_trunked_line[i_start_url + 1:i_end_url])
                    except ValueError as e:
                        self.logger.info(e)
                # in the case the line match a Scrapy Error
                elif re.match('(.*) ERROR:(.*)', line):
                    # the following lines may belong to the same error message
                    expecting_other_line_for_error_message = True
                    # retrieve the text that follows 'ERROR: '
                    s_trunked_line = line[line.index('ERROR: ') + 7:]
                    s_error_message += s_trunked_line

                # in the case the line match an error for type MIME
                elif re.match('(.*) WARNING: Forbidden content (.*)', line):

                    # keep the second part of the line, where the interesting information begins
                    s_trunked_line = line[line.index('WARNING: Forbidden content '):]

                    # start position of the MIME type
                    i_start_error_mime = s_trunked_line.index('[')
                    # end position of the MIME type
                    i_end_error_mime = s_trunked_line.index(']')
                    # the MIME type itself
                    s_error_mime = s_trunked_line[i_start_error_mime +
                                                  1:i_end_error_mime]
                    # the source of the error, 12 characters after the start of 'detected in'
                    s_source_link = s_trunked_line[
                        s_trunked_line.index('detected in') + 12:]

                    try:
                        # create an ErrorTypeMIME associated to the statification
                        statification.add_object_to_statification(
                            ErrorTypeMIME, session, s_error_mime, s_source_link)
                    except ValueError as e:
                        self.logger.info(e)
                # in the case the line match an HTTP error
                elif re.match('(.*) WARNING: HTTP error (.*)', line):

                    # keep the second part of the line, where the interesting information begins
                    s_trunked_line = line[line.index('WARNING: HTTP error '):]

                    # start position of the error code
                    i_start_error_code = s_trunked_line.index('[')
                    # end position of the error code
                    i_end_error_code = s_trunked_line.index(']')
                    # start position of the url that caused the error
                    i_start_url = s_trunked_line.index(' for ')
                    # end position of the url that caused the error
                    i_end_url = s_trunked_line.index(' from ')

                    # the error code
                    s_error_code = s_trunked_line[i_start_error_code +
                                                  1:i_end_error_code]

                    # the url that caused the error
                    s_url = s_trunked_line[i_start_url + 5:i_end_url]

                    # the url of the source where the failing url was found,
                    # without the final character of the line (normally the newline)
                    s_url_source = s_trunked_line[i_end_url + 6:-1]

                    try:
                        # create a new HtmlError associated to the statification
                        statification.add_object_to_statification(
                            HtmlError, session, s_error_code, s_url, s_url_source)
                    except ValueError as e:
                        self.logger.info(e)
                elif re.match('(.*)response_received_count(.*)', line):

                    # the value is located between the first ': ' and the first ',' of the line
                    s_value_item_scraped_count = line[line.index(': ') +
                                                      2:line.index(',')]

                    try:
                        # set the number of crawled items into the statification object
                        statification.upd_nb_item(session, statification.commit,
                                                  int(s_value_item_scraped_count))
                    except ValueError as e:
                        self.logger.info(e)

        # the log may end while an error message is still being accumulated, don't lose it
        if s_error_message != '':
            statification.add_object_to_statification(
                ScrapyError, session, s_error_message)

        try:
            # retrieve the list of file extensions with the number of files for each of them
            s_result_type_files = sh.uniq(
                sh.sort(
                    sh.grep(
                        sh.find(sh.glob(self.s_repository_path + '/*'),
                                '-type', 'f'), '-o', '-E', r'\.[a-zA-Z0-9]+$')),
                '-c')
            # the result is a string, split it into one entry per line after removing all spaces
            a_table_result_type_files = s_result_type_files.replace(
                ' ', '').split('\n')

            # browse the lines of the result
            for row in a_table_result_type_files:
                if row:
                    # a line is composed of a number followed by a type like "42.png",
                    # separate the number and the type
                    s_type_file = row.split('.')

                    try:
                        # create a new ScannedFile associated to the statification
                        statification.add_object_to_statification(
                            ScannedFile, session, s_type_file[1],
                            int(s_type_file[0]))
                    except ValueError as e:
                        self.logger.info(e)
        except sh.ErrorReturnCode_1:
            self.logger.info('There is no folder in the static repository')

        # change the status of the statification (NEED TO BE DONE AT THE END !!)
        statification.upd_status(session, '', Status.STATIFIED)
    def start(self, session: Session, s_designation: str, s_description: str,
              s_user: str):
        """
        Start a statification process with scrapy.

        Any statification with an empty commit is deleted, a new one is created with the CREATED
        status, the previous log file is erased and scrapy is launched in background. The pid of
        the background process is written in the pid file.

        :param session: the database session
        :param s_designation:  the designation of the new statification
        :param s_description: the description of the new statification
        :param s_user: the name of the user which started the operation
        :raise ValueError if the repository path is empty or is not an existing directory,
                          or if the urls or the domains are empty
        """

        # refuse to start when a mandatory setting is missing
        if not (self.s_repository_path
                and os.path.isdir(self.s_repository_path) and self.s_urls
                and self.s_domains):
            raise ValueError(
                "Verify your parameter it seems that one is empty or that one file doesn't exist"
            )

        try:
            # get the statification with empty commit
            statification = Statification.get_statification(session, '')
            # delete the current Statification object with all the linked objects
            statification.delete(session)
            self.logger.info(
                'There was a current statification, it has been deleted, process will continue normally'
            )
        except NoResultFound:
            self.logger.info(
                'There is no current statification, process will continue normally'
            )

        self.logger.info("Create Statification")

        # create a new statification with empty commit ID
        statification = Statification('', s_designation, s_description,
                                      datetime.utcnow(), datetime.utcnow(),
                                      Status.CREATED)
        session.add(statification)
        session.commit()

        # create a StatificationHistoric
        statification.add_object_to_statification(
            StatificationHistoric, session, datetime.utcnow(), s_user,
            Actions.CREATE_STATIFICATION)

        # erase the previous log file, opening it in write mode truncates it
        with open(self.s_log_file, "w"):
            pass

        try:
            # create a new environment to call the subprocess
            new_env = os.environ.copy()
            new_env["PYTHONPATH"] = self.s_python_path

            # create a subprocess that will run scrapy in background
            process = sh.python3('scrapy_cmd.py',
                                 'crawl',
                                 '--loglevel=INFO',
                                 '--logfile=' + self.s_log_file,
                                 '-a',
                                 'output=' + self.s_repository_path,
                                 '-a',
                                 'urls="' + self.s_urls + '"',
                                 '-a',
                                 'domains="' + self.s_domains + '"',
                                 '-a',
                                 'url_regex="' + self.s_url_regex + '"',
                                 '-a',
                                 'url_replacement="' +
                                 self.s_url_replacement + '"',
                                 '-a',
                                 'crawler_count_file=' +
                                 self.s_crawler_progress_counter_file,
                                 'mirroring',
                                 _cwd=self.s_project_directory,
                                 _env=new_env,
                                 _bg=True,
                                 _tty_out=False,
                                 _done=self.done)

            # create the pid file if it doesn't exist, erase it if it exists,
            # the context manager closes it even if the write fails
            with open(self.s_pid_file, "w") as f_pid_file:
                # write the new process pid in the file
                f_pid_file.write(str(process.pid))

        except sh.ErrorReturnCode_1 as e:
            self.logger.info(str(e))
Example #6
0
def service_get_last_statif_infos() -> Dict[str, Any]:
    """
    Get the information of the last statification. The returned python dict contains :
    -   sha                : the sha of the last statification, empty when the last one is a new
                             and unsaved statification
    -   designation        : the designation of the last statification when it is unsaved, else empty
    -   description        : the description of the last statification when it is unsaved, else empty
    -   status             : the status of the last statification :  CREATED = 0
                                                                      STATIFIED = 1
                                                                      SAVED = 2
                                                                      PRODUCTION = 3
                                                                      VISUALIZED = 4
                             The default status is 3 : if there is no statification in the database the
                             user is still able to create a new one, and if a statification is waiting to be
                             pushed to prod the hand is still given to the user that saved it.
    -   i_nb_item_to_crawl : the number of items crawled during the statification preceding the last one,
                             used as a reference of the number of items to crawl for the next statification

    @return a python dict containing the above information
             {
                sha,
                designation,
                description,
                status,
                i_nb_item_to_crawl
             }
    """
    # values returned when the database contains no statification at all, status 3 is PRODUCTION
    infos = {
        'sha': '',
        'designation': '',
        'description': '',
        'status': 3,
        'i_nb_item_to_crawl': 0
    }

    # request the 2 most recent statifications : once a statification process has started, the most
    # recent one is the current (unsaved) statification
    recent = Statification.get_n_list_statifications(current_app.session, 2, 0, Statification.id, 'desc')

    # nothing in the database, keep the defaults
    if len(recent) == 0:
        return infos

    if len(recent) > 1:
        # number of items crawled by the statification preceding the most recent one
        infos['i_nb_item_to_crawl'] = recent[1]['nb_item']

    newest = recent[0]
    infos['status'] = newest['status']
    infos['sha'] = newest['sha']

    # designation and description are only exposed for a statification that has not been saved yet
    if infos['sha'] == '':
        infos['designation'] = newest['designation']
        infos['description'] = newest['description']

    return infos
Example #7
0
def service_get_statif_info(s_archive_sha: str) -> Dict[str, Any]:
    """
    Get the information of the statification matching the given sha, together with the data of the
    objects associated to it. The returned python dict contains :
      -  statification : the data of the statification object
      -  errors_type_mime : the list of errors type mime
      -  external_links : the list of external links
      -  html_errors : the list of html errors
      -  scanned_files : the list of scanned files
      -  scrapy_errors : the list of scrapy errors
      -  statification_historics : the list of statification historics.
    An entry is None when the corresponding lookup raised NoResultFound.

    @param s_archive_sha: the sha of the statification, an empty sha is not validated
    @return a python dict containing all the information of the statification, or a dict with
            'success' False and an 'error' code ('sha_unvalid' or 'system_fail')
    """
    # every entry stays None until the matching lookup succeeds
    infos = {
        'statification': None,
        'errors_type_mime': None,
        'external_links': None,
        'html_errors': None,
        'scanned_files': None,
        'scrapy_errors': None,
        'statification_historics': None
    }

    try:
        # only a non empty sha has to be validated
        if s_archive_sha != '':
            validate_sha(s_archive_sha, current_app.config['ARCHIVE_REPOSITORY'])

        try:
            # get the statification corresponding to the sha and its data
            statification = Statification.get_statification(current_app.session, s_archive_sha)
            infos['statification'] = statification.get_dict()

            # get each kind of object associated to the statification, a kind without result is
            # logged and left to None without preventing the lookup of the following kinds
            for s_key, linked_class in (
                    ('errors_type_mime', ErrorTypeMIME),
                    ('external_links', ExternalLink),
                    ('html_errors', HtmlError),
                    ('scanned_files', ScannedFile),
                    ('scrapy_errors', ScrapyError),
                    ('statification_historics', StatificationHistoric),
            ):
                try:
                    infos[s_key] = statification.get_list_from_class(linked_class, current_app.session)
                except NoResultFound as e:
                    current_app.logger.info(e)
        except NoResultFound as e:
            # no statification for this sha, every entry is returned as None
            current_app.logger.info(e)

        return infos
    except SyntaxError as e:
        current_app.logger.error(e)
        # error code returned when the sha is not valid
        return {
            'success': False,
            'error': 'sha_unvalid'
        }
    except sh.ErrorReturnCode:
        # error code returned when a subprocess has failed
        return {
            'success': False,
            'error': 'system_fail'
        }
Example #8
0
    def visualize_done(cmd: str, success: bool, exit_code: int):
        """
        That method is called after the git.pull process has finished.

        On success the visualize repository is reinitialized, the commit is checked out, the statuses
        are updated in the database and a success status is written in the background status file.
        Whatever the outcome, a status is written in the background status file when a handled
        error occurs and the lock is released.

        :param cmd: the string containing the command that launched git.pull
        :param success: a boolean , true if success false otherwise
        :param exit_code: the exit code that was returned by the command (not used here)
        :raise RuntimeError if the command was unsuccessful
        """
        # create a session for this specific code , because it's executed after the flask instance has been killed
        session = open_session_db(s_database_uri)

        if not success:
            # report the failure and release the lock before raising, otherwise nothing would
            # ever write the failure status nor unlock the access
            error = RuntimeError('Operation was unsuccessful : ' + cmd)
            logger.error(str(error))
            try:
                write_status_background(
                    {
                        'success': False,
                        'error': 'subprocess',
                        'operation': 'visualize'
                    }, s_file_status_background)
            finally:
                unlock_access(s_lock_file)
            raise error

        try:
            # reinitialize the repository before checking out the commit
            service_do_init_statif(s_visualize_repository, s_url_git)

            logger.info('> Checkout the visualized commit')

            git = sh.git.bake(_cwd=s_visualize_repository, _tty_out=False)

            # checkout the commit on the visualize repository and log the output of the command
            for line in git.checkout('-q', s_commit, _iter=True):
                logger.info(line)

            logger.info(
                '> Change status for last visualized statification to saved')

            try:
                # change the status of the previous Visualized statification to Saved status
                Statification.switch_status(session, Status.VISUALIZED,
                                            Status.SAVED)
            except (ValueError, NoResultFound) as e:
                # in the case there was no statification that had the status VISUALIZED before, we just catch the error
                # and we continue as normal
                logger.info(str(e))

            logger.info('> Change status for new visualized statification')

            # Now the statification is on the visualize repository so we change the status from default to visualized
            Statification.upd_status(session, s_commit, Status.VISUALIZED)

            # update the date of update of the statification
            Statification.upd_upd_date(session, s_commit, datetime.utcnow())

            # create a StatificationHistoric to keep track of the modification
            Statification.static_add_object_to_statification(
                StatificationHistoric, session, s_commit, datetime.utcnow(),
                s_user, Actions.VISUALIZE_STATIFICATION)
            # on success write a success code
            write_status_background(
                {
                    'success': True,
                    'operation': 'visualize',
                    'commit': s_commit
                }, s_file_status_background)
        except (ValueError, NoResultFound) as e:
            # there is no reason the process execute the code here
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'database',
                    'operation': 'visualize'
                }, s_file_status_background)
        except RuntimeError as e:
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'subprocess',
                    'operation': 'visualize'
                }, s_file_status_background)
        except sh.ErrorReturnCode as e:
            logger.error(str(e))
            # if an error has happened during a subprocess
            write_status_background(
                {
                    'success': False,
                    'error': 'subprocess',
                    'operation': 'visualize'
                }, s_file_status_background)
        finally:
            # always unlock the access
            unlock_access(s_lock_file)
Example #9
0
    def do_apply_prod_done(cmd: str, success: bool, exit_code: int):
        """
        That method is called after the git.branch process has finished.

        On success the 'production' branch is force pushed to every production url, the statuses are
        updated in the database and a success status is written in the background status file.
        When a handled error occurs a failure status is written instead, and the lock is released
        in all cases.

        :param cmd: the string containing the command that was launched
        :param success: a boolean , true if success false otherwise
        :param exit_code: the exit code that was returned by the command (not used here)
        :raise RuntimeError if the command was unsuccessful
        """
        # create a session for this specific code , because it's executed after the flask instance has been killed
        session = open_session_db(s_database_uri)

        if not success:
            # report the failure and release the lock before raising, otherwise nothing would
            # ever write the failure status nor unlock the route
            error = RuntimeError('Operation was unsuccessful : ' + cmd)
            logger.error(str(error))
            try:
                write_status_background(
                    {
                        'success': False,
                        'error': 'subprocess',
                        'operation': 'pushtoprod'
                    }, s_file_status_background)
            finally:
                unlock_access(s_lock_file)
            raise error

        try:
            # Select the git repository
            git = sh.git.bake(_cwd=s_static_repository, _tty_out=False)

            logger.info('> Push the statification to production server(s)')

            # the pushes are done inside the try block so that a failing push still
            # writes a failure status and releases the lock
            for url in urls_git_prod:
                logger.info('> Push statification to server : ' + url)
                # push the commit to production server
                for log in git.push('-f', url, 'production', _iter='out'):
                    logger.info(log)

            logger.info(
                '> Change status of the last statification push to prod to saved')

            try:
                # change the status of the previous put in Production statification to SAVED status
                Statification.switch_status(session, Status.PRODUCTION,
                                            Status.SAVED)
            except (ValueError, NoResultFound) as e:
                # in the case there was no statification that had the status PRODUCTION before, we just catch the error
                # and we continue as normal
                logger.info(str(e))

            logger.info('> Change status of the new statification to push to prod')

            # change status of the statification pushed to prod to PRODUCTION
            Statification.upd_status(session, s_commit, Status.PRODUCTION)

            # update the date of update of the statification
            Statification.upd_upd_date(session, s_commit, datetime.utcnow())

            # create a StatificationHistoric to keep track of the modification
            Statification.static_add_object_to_statification(
                StatificationHistoric, session, s_commit, datetime.utcnow(),
                s_user, Actions.PUSHTOPROD_STATIFICATION)

            logger.info('> Push to prod operations terminated')
            write_status_background(
                {
                    'success': True,
                    'operation': 'pushtoprod',
                    'commit': s_commit
                }, s_file_status_background)
        except RuntimeError as e:
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'subprocess',
                    'operation': 'pushtoprod'
                }, s_file_status_background)
        except (ValueError, NoResultFound) as e:
            # there is no reason the process execute the code here
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'database',
                    'operation': 'pushtoprod'
                }, s_file_status_background)
        except sh.ErrorReturnCode as e:
            # if an error has happened during a subprocess (e.g. a push was rejected)
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'subprocess',
                    'operation': 'pushtoprod'
                }, s_file_status_background)
        finally:
            # always unlock the route
            unlock_access(s_lock_file)
Example #10
0
    def commit_done(cmd: str, success: bool, exit_code: int):
        """
        That method is called after the git.push process has finished.

        On success the log file is renamed with the commit sha, the commit is registered in the
        database with the SAVED status and a success status is written in the background status file.
        When a handled error occurs a failure status is written instead, and the lock is released
        in all cases.

        :param cmd: the string containing the command that launched git.push
        :param success: a boolean , true if success false otherwise
        :param exit_code: the exit code that was returned by the command (not used here)
        :raise RuntimeError if the command was unsuccessful
        """
        # create a session for this specific code , because it's executed after the flask instance has been killed
        session = open_session_db(s_database_uri)

        if not success:
            # report the failure and release the lock before raising, otherwise nothing would
            # ever write the failure status nor unlock the route
            error = RuntimeError('Operation was unsuccessful : ' + cmd)
            logger.error(str(error))
            try:
                write_status_background(
                    {
                        'success': False,
                        'error': 'subprocess',
                        'operation': 'commit'
                    }, s_file_status_background)
            finally:
                unlock_access(s_lock_file)
            raise error

        logger.info('> Rename log file with commit sha')

        try:
            # rename the logfile of the statification by the commit SHA
            os.rename(s_log_file, s_log_dir + "/" + s_commit + ".log")

            logger.info('> Register Commit into the database')

            # update the current statification with no commit with the new commit sha
            Statification.upd_commit(session, '', s_commit)

            # Now the statification is on git so we change the status from statified to SAVED
            Statification.upd_status(session, s_commit, Status.SAVED)

            # update the date of update of the statification
            Statification.upd_upd_date(session, s_commit, datetime.utcnow())

            # create a StatificationHistoric to keep track of the modification
            Statification.static_add_object_to_statification(
                StatificationHistoric, session, s_commit, datetime.utcnow(),
                s_user, Actions.COMMIT_STATIFICATION)
            logger.info('> Commit operations terminated')

            # on success write a success code and the commit id
            write_status_background(
                {
                    'success': True,
                    'commit': s_commit,
                    'operation': 'commit'
                }, s_file_status_background)
        except RuntimeError as e:
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'subprocess',
                    'operation': 'commit'
                }, s_file_status_background)
        except FileNotFoundError as e:
            # the log file to rename doesn't exist
            logger.error(str(e))
            write_status_background(
                {
                    'success': False,
                    'error': 'log_file',
                    'operation': 'commit'
                }, s_file_status_background)
        except (ValueError, NoResultFound) as e:
            logger.error(str(e))
            # if no statification was found for the given commit
            # write an error in the statusBackground file
            write_status_background(
                {
                    'success': False,
                    'error': 'database',
                    'operation': 'commit'
                }, s_file_status_background)
        finally:
            # always unlock the route
            unlock_access(s_lock_file)