Beispiel #1
0
    def duplicate_content(self, request1, url):
        """Detect and avoid URL-parameter duplicates.

        Fetch the same page with its query string stripped and compare
        the two source codes. If they are the same content, keep the
        parameter-less url.

        :param request1: response of the original request
        :type request1: requests.models.Response
        :param url: fallback url returned when cleaning fails
        :return: chosen url and its source code

        """
        url1 = clean_link(request1.url)
        if url1 is None:
            return url, request1.text
        parts = urlparse(url1)
        if parts.query == '':
            # No parameters: nothing to deduplicate.
            return url1, request1.text
        # Re-fetch the page without its query string:
        stripped_url = parts.scheme + '://' + parts.netloc + parts.path
        request2 = self.send_request(stripped_url)
        if not isinstance(request2, requests.models.Response):
            return url1, request1.text
        request2.encoding = self.search_encoding(request2.headers,
                                                 request2.text)[0]
        url2 = clean_link(request2.url)
        if url2 is None:
            return url1, request1.text
        if connexion.duplicate_content(request1.text, request2.text):
            tell("Same content: " + url1 + " and " + url2)  # Tests
            return url2, request2.text
        return url1, request1.text
Beispiel #2
0
    def send_doc(self, webpage_infos):
        """Send a crawled document's informations to the database.

        Update the existing row when the url is already stored and was
        not crawled too recently, insert a new row otherwise.

        :param webpage_infos: informations to send to database
        :type webpage_infos: dict
        :return: True if an error occured

        """
        result, response = self.send_command(
            "SELECT popularity, last_crawl FROM search WHERE url = %s",
            (webpage_infos['url'], ), True)
        if 'error' in response:
            tell('Popularity and last_crawl query failed: ' + response, 16)
            return True
        if result != ():
            # Url found in database, there is a answer:
            last_crawl = result[0][1]  # datetime.datetime object
            if (datetime.now() - last_crawl) > CRAWL_DELAY:
                # Enough time elapsed since the last crawl: refresh the
                # row and bump its popularity (result[0][0]).
                error = self.update(webpage_infos, result[0][0] + 1)
                if error:
                    return True
            else:
                tell('Recently crawled: ' + webpage_infos['url'])
        else:
            # Url not found in database, so we add it:
            error = self.insert(webpage_infos)
            if error:
                return True
        return False  # All is correct
Beispiel #3
0
    def search_encoding(self, headers, code):
        """Determine the encoding of a webpage.

        Look in the HTTP headers first, then in the html source code.
        The score is 1 when an explicit encoding was found, 0 when
        falling back to utf-8.

        :param headers: headers of requests
        :type headers: dict
        :param code: source code
        :type code: str
        :return: encoding of webpage and its score

        """
        # Search in headers (stringified dict, e.g. "... charset=utf-8'"):
        lowered = str(headers).lower()
        start = lowered.find('charset')
        end = lowered.find('\'', start)
        if start != -1 and end != -1:
            # 'charset=' is 8 characters long:
            return lowered[start + 8:end], 1
        # Search in source code:
        self.parser_encoding.feed(code)
        found = self.parser_encoding.encoding
        if found != '':
            return found, 1
        tell('No encoding', 9, severity=0)
        return 'utf-8', 0
Beispiel #4
0
    def https_duplicate(self, old_url):
        """Avoid https and http duplicates of the same document.

        When the secure (https) version of a document exists it is the
        one to keep; the insecure (http) one, if stored, must be deleted.

        :param old_url: old url
        :type old_url: str
        :return: url to add and url to delete (or None)

        """
        tell('url to send: ' + old_url, severity=-1)
        new_url = database.convert_secure(old_url)
        new_exists = self.doc_exists(new_url)

        if database.url_is_secure(old_url):
            # old_url already uses https: delete its http twin if stored.
            return old_url, (new_url if new_exists else None)
        # old_url uses http.
        if not new_exists:
            # No secure version known: keep the insecure one.
            return old_url, None
        # Secure version exists: keep it, drop the http doc if stored.
        return new_url, (old_url if self.doc_exists(old_url) else None)
Beispiel #5
0
 def tell_progress(self, upload=True):
     """Log transfer progress as a percentage of handled files."""
     direction = 'Uploading' if upload else 'Downloading'
     if self.nb_files == 0:
         tell('No progress data')
     else:
         percent = round(self.downuploaded_files * 100 / self.nb_files, 2)
         tell('{} {}% ({}/{})'.format(direction, percent,
                                      self.downuploaded_files,
                                      self.nb_files))
Beispiel #6
0
 def download_lists_words(self):
     """Download stopwords and badwords files from the server."""
     tell('download list of words')
     self.connexion()
     filenames = ('en.stopwords.txt', 'fr.stopwords.txt', 'en.badwords.txt',
                  'fr.badwords.txt')
     for filename in filenames:
         # 'xx.stopwords.txt' -> 'stopwords/', 'xx.badwords.txt' -> 'badwords/'
         subdir = filename[3:-4] + '/'
         self.cd('/var/www/html/data/' + subdir)
         self.get(DIR_DATA + subdir + filename, filename)
     self.disconnect()
Beispiel #7
0
    def save_inverted_index(self, inverted_index):
        """Dump the inverted-index to a local .json file.

        Used as a fallback when the inverted-index cannot be sent.

        :param inverted_index: inverted-index
        :type inverted_index: dict

        """
        tell('Save inverted-index in save file')
        with open(FILE_INDEX, 'w') as save_file:
            json.dump(inverted_index, save_file, ensure_ascii=False)
Beispiel #8
0
    def ckeck_size_links(self, links):
        """Check the number of links and switch writing file when too many.

        NOTE: the method name keeps its historical typo ('ckeck') because
        callers depend on it.

        :param links: links saved in file
        :type links: str

        """
        if len(links) > self.max_links:
            self.writing_file_number += 1
            message = 'More than {0} links : {1} : writing file {2}.'.format(
                str(self.max_links), str(len(links)),
                str(self.writing_file_number))
            tell(message, severity=-1)
Beispiel #9
0
    def get_inverted_index(self):
        """Read the locally saved inverted-index.

        Called after a connexion error. Loads the .json save file
        containing the inverted-index, then deletes the file.

        :return: inverted-index

        """
        tell('Get inverted-index from save file')
        with open(FILE_INDEX, 'r') as save_file:
            inverted_index = json.load(save_file)
        remove(FILE_INDEX)
        return convert_keys(inverted_index)
Beispiel #10
0
    def indexing(self):
        """Index the crawled webpages.

        Fetch the database id of each document and feed its keywords to
        the index manager. Quit cleanly when an id cannot be obtained.

        """
        module.tell('Indexing', severity=2)
        for infos in self.infos:
            doc_id = self.database.get_doc_id(infos['url'])
            if doc_id is None:
                self.safe_quit()
            module.tell('Indexing {0} {1}'.format(doc_id, infos['url']))
            self.index_manager.add_doc(infos['keywords'], doc_id,
                                       infos['language'])
Beispiel #11
0
def no_connexion(url='https://github.com'):
	"""Check whether the network is reachable.

	Try a GET request on the given url.

	:param url: url used by the test
	:return: True if there is no connexion

	"""
	try:
		requests.get(url)
	except requests.exceptions.RequestException:
		tell('No connexion')
		return True
	return False
Beispiel #12
0
    def get_doc_id(self, url, table='search'):
        """Get id of a document in database.

        :param url: url of webpage
        :type url: str
        :param table: table, default to search
        :type table: str
        :return: id of webpage or None if not found

        """
        result, response = self.send_command(
            "SELECT id FROM {} WHERE url = %s".format(table), (url, ))
        # response is the status message string (it is concatenated below,
        # and send_doc checks it the same way); the old `response[1]` only
        # inspected its second character, so errors went undetected.
        if 'error' in response:
            tell('Failed to get id: ' + response, 11)
            return None
        if not result:
            # No row matched: honour the documented 'None if not found'
            # instead of raising on an empty result.
            return None
        # result is a single row; its first column is the id.
        return str(result[0])
Beispiel #13
0
    def send_to_db(self):
        """Send all informations about crawled webpages to the database.

        May delete some documents to avoid http and https duplicates.

        """
        module.tell('Send to database', severity=2)
        for infos in self.infos:
            infos['url'], url_to_del = self.database.https_duplicate(
                infos['url'])
            if url_to_del:
                self.delete_if_exists(url_to_del)
            module.tell('New url (to add): ' + infos['url'], severity=-1)
            if self.database.send_doc(infos):
                # Database error: shut down cleanly.
                self.safe_quit()
Beispiel #14
0
    def suggestions(self):
        """Get the five first urls from the suggestions table and delete them.

        :return: list of urls from the suggestions table, or None on error

        """
        result, response = self.send_command(
            "SELECT url FROM suggestions LIMIT 5", fetchall=True)
        # response is the status message string (it is concatenated below);
        # the old `response[1]` only inspected its second character, so
        # errors went undetected.
        if 'error' in response:
            tell('Failed to get url: ' + response, 13)
            return None
        suggested_links = list()
        for element in result:
            if len(suggested_links) < 5:
                suggested_links.append(element[0])
                self.del_one_doc(element[0], 'suggestions')
        return suggested_links
Beispiel #15
0
    def del_one_doc(self, url, table='search'):
        """Delete the document matching url from the given table.

        :param url: url of webpage
        :type url: str
        :param table: table to delete from, default to search
        :type table: str
        :return: status message

        """
        tell('Delete from {} doc: '.format(table) + url)
        # send_command returns a (result, message) pair:
        status = self.send_command(
            "DELETE FROM {} WHERE url = %s".format(table), (url, ))
        message = status[1]
        if 'error' in message:
            tell('Doc not removed: {0}, {1}'.format(url, message), 12)
        return message
Beispiel #16
0
 def check_size_files(self):
     """Archive log files that grew beyond MAX_SIZE lines.

     Each oversized log is moved into a numbered entry of a zip archive
     named after the log file, then removed from disk.

     """
     for filelog in [FILE_EVENTS, FILE_ERRORS]:
         # Drop the 3-char extension — presumably '.log' — to name the zip:
         filearchive = filelog[:-3] + 'zip'
         with open(filelog, 'r') as myfile:
             content = myfile.readlines()
         if len(content) > MAX_SIZE:  # size measured in lines, not bytes
             if not path.exists(filearchive):
                 # First archiving: create an empty zip, entries start at '0'.
                 ZipFile(file=filearchive, mode='w').close()
                 filename = '0'
             else:
                 with ZipFile(filearchive, 'r') as myzip:
                     filename = str(int(myzip.namelist()[-1]) +
                                    1)  # The last one +1
             # Move the log aside under its entry name, zip it, clean up:
             rename(filelog, filename)
             with ZipFile(filearchive, 'a') as myzip:
                 myzip.write(filename)
             remove(filename)
             tell('Archiving ' + filelog + ': ' + filename, severity=-1)
Beispiel #17
0
    def insert(self, infos):
        """Insert a new document in database.

        :param infos: doc infos
        :type infos: dict
        :return: True if an error occured

        """
        tell('Adding ' + infos['url'])
        # New docs start with likes=0 and popularity=1 (see VALUES below).
        response = self.send_command(
        """INSERT INTO search (title, description, url, first_crawl, last_crawl, language, likes, popularity, score, homepage, sanesearch, favicon)
VALUES (%s, %s, %s, NOW(), NOW(), %s, 0, 1, %s, %s, %s, %s)""", \
      (infos['title'], infos['description'], infos['url'], infos['language'], infos['score'], infos['homepage'], infos['sanesearch'], infos['favicon']))
        # send_command returns a (result, message) pair; the message is in [1].
        if 'error' in response[1]:
            tell('Failed to add: ' + response[1], 10)
            return True
        else:
            return False
Beispiel #18
0
    def start(self):
        """Start main loop of crawling.

        Crawl 10 webpages, send documents to database, index them
        and save the configurations (line number in links file, ...).
        Send the inverted-index and check for suggestions every 50 outer
        iterations (i.e. roughly each 500 crawled webpages).

        Do it until the user wants to stop crawling or an error occurs.

        """
        run = True
        while run:
            stats_send_index = time()  # timestamp used for index-upload stats
            self.suggestions()
            for _ in range(50):
                module.tell('Crawl', severity=2)
                begining = time()
                # Gather informations about 10 webpages per iteration:
                while len(self.infos) < 10:
                    module.tell('File {0}, line {1}'.format(
                        str(self.file_manager.reading_file_number),
                        str(self.file_manager.reading_line_number + 1)),
                                severity=0)
                    url = self.file_manager.get_url(
                    )  # Get the url of the website
                    if url == 'stop':
                        # No more links file available: shut down cleanly.
                        self.safe_quit()
                    self.crawl_webpage(url)

                # End of crawling loop

                module.tell('{} new documents!'.format(self.crawled_websites),
                            severity=-1)

                self.send_to_db()
                self.indexing()

                module.stats_webpages(begining, time())

                self.infos.clear(
                )  # Reset the list of dict of informations of websites.
                self.file_manager.check_stop_crawling()
                self.file_manager.save_config()
                if self.file_manager.run == 'false':
                    module.tell('User wants stop program')
                    self.safe_quit()
                    run = False
                    break

            # End of loop range(n)
            if run:
                self.suggestions()
                self.send_inverted_index()
                self.file_manager.check_size_files()
                module.stats_send_index(stats_send_index, time())
Beispiel #19
0
    def doc_exists(self, url, table='search'):
        """Check whether the given url is stored in database.

        :param url: url corresponding to doc
        :type url: str
        :param table: table, default to search
        :type table: str
        :return: True if doc exists, None on error

        """
        result, response = self.send_command(
            "SELECT EXISTS(SELECT * FROM {} WHERE url=%s)".format(table),
            (url, ))
        if 'error' in response:
            tell('Failed to check row: ' + response, 14)
            return None
        # EXISTS yields a single row holding 0 or 1:
        return result[0] == 1
Beispiel #20
0
    def send_inverted_index(self, inverted_index):
        """Send inverted-index to the server over sftp.

        The index is laid out as language/first_letter/two_letters files;
        remote directories are created as needed and mirrored locally
        under DIR_INDEX.

        :param inverted_index: inverted-index to send
        :type inverted_index: dict
        :return: True if an error occured

        """
        tell('send inverted-index')
        self.downuploaded_files = 0
        self.nb_files = count_files_index(
            inverted_index)  # Count files from index (prepare to upload)
        self.connexion()
        self.cd(self.sftp_index)

        for language in inverted_index:
            # Create the remote language directory if it is missing:
            list_language = self.listdir()
            if language not in list_language:
                self.mkdir(language)
            # Mirror the directory locally:
            if not path.isdir(DIR_INDEX + language):
                mkdir(DIR_INDEX + language)
            self.cd(language)
            for first_letter in inverted_index[language]:
                self.tell_progress()
                list_first_letter = self.listdir()
                if first_letter not in list_first_letter:
                    self.mkdir(first_letter)
                if not path.isdir(DIR_INDEX + language + '/' + first_letter):
                    mkdir(DIR_INDEX + language + '/' + first_letter)

                self.cd(first_letter)
                for two_letters in inverted_index[language][first_letter]:
                    index = inverted_index[language][first_letter][two_letters]
                    self.upload(language, first_letter, two_letters, index)

                self.cd('..')
            self.cd('..')

        self.disconnect()
        tell('Transfer complete', severity=0)
        return False
Beispiel #21
0
    def update(self, infos, popularity):
        """Update a document in database.

        :param infos: doc infos
        :type infos: dict
        :param popularity: new doc popularity
        :type popularity: int
        :return: True if an error occured

        """
        tell('Updating ' + infos['url'])
        response = self.send_command(
        """UPDATE search
SET title=%s, description=%s, last_crawl=NOW(), language=%s, popularity=%s, score=%s, homepage=%s, sanesearch=%s, favicon=%s
WHERE url = %s """, (infos['title'], infos['description'], infos['language'], popularity, infos['score'],\
       infos['homepage'], infos['sanesearch'], infos['favicon'], infos['url']))
        # send_command returns a (result, message) pair; the message is in [1].
        if 'error' in response[1]:
            tell('Failed to update: ' + response[1], 9)
            return True
        else:
            return False
Beispiel #22
0
    def get_url(self):
        """Get url of next webpage to crawl.

        Read the current links file, return the link at the current line
        and advance the cursor; when the file is exhausted, move on to
        the next links file (deleting the finished one, except file 0).

        :return: url of webpage to crawl, or 'stop' when no file is found

        """
        filename = DIR_LINKS + str(self.reading_file_number)
        try:
            with open(filename, 'r', errors='replace',
                      encoding='utf8') as myfile:
                list_links = myfile.read().splitlines()  # List of urls
        except FileNotFoundError:
            tell('Reading file is not found in get_url: ' + filename, 4)
            return 'stop'
        else:
            url = list_links[self.reading_line_number]
            self.reading_line_number += 1
            # If it is the last link of the file:
            if len(list_links) == (self.reading_line_number):
                self.reading_line_number = 0
                if self.reading_file_number != 0:
                    remove(filename)
                    tell('File ' + filename + ' removed', severity=-1)
                self.reading_file_number += 1
                # All links of this file were read: next reading_file_number
                tell('Next reading file: ' + str(self.reading_file_number),
                     severity=-1)
            return url
    def sane_search(self, keywords, language, max_ratio=.2):
        """Filter adult websites based on their badwords ratio.

        :param keywords: webpage's keywords
        :type keywords: list
        :param language: found website language
        :type language: str
        :param max_ratio: badwords ratio above which the site is flagged
        :type max_ratio: float
        :return: True if the webpage looks like an adult website

        """
        badwords = self.BADWORDS[language]
        nb_words = len(keywords)
        if nb_words == 0:
            # No keywords at all: nothing to judge — and dividing by zero
            # below would crash the crawler.
            return False
        nb_badwords = sum(1 for keyword in keywords if keyword in badwords)
        if nb_badwords / nb_words >= max_ratio:
            tell('bad site detected')
            return True
        return False
Beispiel #24
0
    def crawl_webpage(self, url):
        """Crawl the given url.

        Get webpage source code, feed it to the parser, manage extracted
        data, manage redirections and can delete some documents to avoid
        duplicates.

        :param url: url of webpage
        :type url: str

        """
        module.tell('Crawling ' + url)
        # Get webpage's html code:
        new_url, html_code, nofollow, score, all_urls = self.web_connexion.get_code(
            url)
        if html_code is None:
            self.delete_if_exists(
                all_urls)  # Failed to get code, must delete from database.
        elif html_code == 'no connexion':
            # Network is down: stop the whole crawler.
            sys.exit()
        elif html_code == 'ignore':  # There was something wrong and maybe a redirection.
            self.delete_if_exists(all_urls)
        else:
            module.tell('New url: ' + new_url, severity=0)
            self.delete_if_exists(all_urls)  # Except new url
            webpage_infos, links = self.site_informations.get_infos(
                new_url, html_code, nofollow, score)
            webpage_infos['url'] = new_url

            # An empty title means the page is not worth indexing:
            if webpage_infos['title'] != '':
                if module.can_add_doc(
                        self.infos, webpage_infos):  # Duplicate only with url
                    self.infos.append(webpage_infos)
                    self.crawled_websites += 1
                    links = self.file_manager.save_links(links)
                    self.file_manager.ckeck_size_links(links)
            else:
                self.delete_if_exists(new_url)
Beispiel #25
0
    def read_inverted_index(self):
        """Load the inverted-index from the local index directory.

        Called after sending the inverted-index without error.
        Reads every file created for sending the inverted-index.

        :return: inverted-index

        """
        tell('Get inverted-index in local')
        inverted_index = dict()
        for language in listdir(DIR_INDEX):
            language_dir = DIR_INDEX + language + '/'
            inverted_index[language] = dict()
            for first_letter in listdir(DIR_INDEX + language):
                letter_dir = language_dir + first_letter + '/'
                letter_index = dict()
                for filename in listdir(DIR_INDEX + language + '/' +
                                        first_letter):
                    with open(letter_dir + filename, 'r',
                              encoding='utf-8') as index_file:
                        # Strip the 4-char '.sif'/'.xxx' extension for the key:
                        letter_index[filename[:-4]] = json.load(index_file)
                inverted_index[language][first_letter] = letter_index
        return convert_keys(inverted_index)
Beispiel #26
0
    def delete_if_exists(self, urls):
        """Delete the given urls from database and inverted-index if stored.

        :param urls: url(s) to delete
        :type urls: str or list

        """
        url_list = [urls] if isinstance(urls, str) else urls
        for url in url_list:
            exists = self.database.doc_exists(url)
            if exists is None:
                # Database error while checking:
                self.safe_quit()
            elif exists:
                doc_id = self.database.get_doc_id(url)
                if not doc_id:
                    self.safe_quit()
                else:
                    self.database.del_one_doc(url)
                    self.index_manager.delete_doc_id(doc_id)
            else:
                module.tell('Ignore: ' + url, severity=-1)
Beispiel #27
0
    def get_code(self, url):
        """Get source code of given url.

        :param url: url of webpage
        :type url: str
        :return: new url (after a possible redirection), source code (or
            None / 'no connexion' / 'ignore'), True if links must not be
            followed, encoding score, and the urls to delete

        """
        nofollow, url = connexion.is_nofollow(url)
        result = self.send_request(url)
        if not isinstance(result, requests.models.Response):
            # result is None or 'no connexion' (see send_request):
            return None, result, None, None, url
        else:
            request = result
            del result
            allowed = self.check_robots_perm(url)
            if request.status_code == requests.codes.ok and request.headers.get(
                    'Content-Type', '').startswith('text/html') and allowed:
                # Search encoding of webpage:
                request.encoding, score = self.search_encoding(
                    request.headers, request.text)
                new_url, code = self.duplicate_content(
                    request, url)  # new_url is clean and maybe without params
                all_urls = connexion.all_urls(
                    request)  # List of urls to delete
                if new_url in all_urls:  # new_url must not be deleted
                    all_urls.remove(new_url)
                return new_url, code, nofollow, score, all_urls
            else:
                tell('Webpage infos: status code=' + str(request.status_code) + ', Content-Type=' + \
                 request.headers.get('Content-Type', '') + ', robots perm=' + str(allowed), severity=0)
                # All redirections urls, the first and the last:
                all_urls = connexion.all_urls(request)
                all_urls.append(request.url)
                all_urls.append(url)
                return None, 'ignore', None, None, remove_duplicates(all_urls)
Beispiel #28
0
    def check_robots_perm(self, url):
        """Check robots.txt for permission.

        Any error while reading robots.txt defaults to allowing the crawl.

        :param url: webpage url
        :type url: str
        :return: True if can crawl

        """
        try:
            return self.reqrobots.allowed(url, USER_AGENT)
        except ServerError as error:
            tell('Error robots.txt (reppy): ' + str(error) + ' ' + url, 6)
        except requests.exceptions.Timeout:
            tell('Error robots.txt (timeout): ' + url)
        except requests.exceptions.RequestException as error:
            tell('Error robots.txt (requests): ' + str(error) + ' ' + url, 7)
        except Exception as error:
            # Deliberate catch-all: a broken robots.txt must not stop crawling.
            tell('Unknow robots.txt error: ' + str(error) + ' ' + url, 8)
        return True
Beispiel #29
0
 def send_request(self, url):
     """Fetch the given url, mapping network failures to sentinel values.

     :param url: url to request
     :return: the response, 'no connexion' when offline, None on error

     """
     try:
         return requests.get(url, headers=HEADERS, timeout=TIMEOUT)
     except requests.packages.urllib3.exceptions.ReadTimeoutError:
         tell('Read timeout error (urllib3): ' + url, 3)
     except requests.exceptions.Timeout:
         tell('Timeout error: ' + url, 4)
     except requests.exceptions.RequestException as error:
         tell('Connexion failed: {}, {}'.format(str(error), url), 5)
         if connexion.no_connexion():
             return 'no connexion'
     return None
Beispiel #30
0
    def get_inverted_index(self):
        """Download the inverted-index from the server over sftp.

        The remote index is laid out as language/first_letter/filename;
        directories are mirrored locally under DIR_INDEX while walking.

        :return: inverted-index (empty dict when the server holds none)

        """
        tell('Get inverted-index from server')
        self.downuploaded_files = 0
        inverted_index = dict()
        self.connexion()
        self.cd(self.sftp_index)
        self.nb_files = self.countfiles(
        )  # Count files on server (prepare to download)
        list_language = self.listdir()

        for language in list_language:
            self.cd(language)
            # Mirror the remote directory locally:
            if not path.isdir(DIR_INDEX + language):
                mkdir(DIR_INDEX + language)
            inverted_index[language] = dict()
            list_first_letter = self.listdir()
            for first_letter in list_first_letter:
                self.tell_progress(False)
                self.cd(first_letter)
                if not path.isdir(DIR_INDEX + language + '/' + first_letter):
                    mkdir(DIR_INDEX + language + '/' + first_letter)
                inverted_index[language][first_letter] = dict()
                list_filename = self.listdir()
                for filename in list_filename:
                    # Strip the 4-char extension for the index key:
                    inverted_index[language][first_letter][
                        filename[:-4]] = self.download(language, first_letter,
                                                       filename)

                self.cd('..')
            self.cd('..')

        self.disconnect()
        if inverted_index == dict():
            tell('No inverted-index on server', severity=0)
        else:
            tell('Transfer complete', severity=0)
        return inverted_index