def duplicate_content(self, request1, url):
    """Detect and avoid URL-parameter duplicates.

    Re-fetch the same URL without its query string and compare the two
    source codes; if the content is identical, prefer the parameter-less
    URL.

    :param request1: response for the original url
    :type request1: requests.models.Response
    :param url: fallback url returned when the response url can't be cleaned
    :type url: str
    :return: url to keep and its source code
    """
    url1 = clean_link(request1.url)
    if url1 is None:
        # Url can't be cleaned: keep the caller's url and content as-is.
        return url, request1.text
    infos_url = urlparse(url1)
    if infos_url.query != '':
        # Rebuild the url without its query string and fetch it:
        new_url = infos_url.scheme + '://' + infos_url.netloc + infos_url.path
        request2 = self.send_request(new_url)
        if not isinstance(request2, requests.models.Response):
            # Second fetch failed: keep the original url.
            return url1, request1.text
        request2.encoding = self.search_encoding(request2.headers, request2.text)[0]
        url2 = clean_link(request2.url)
        if url2 is None:
            return url1, request1.text
        if connexion.duplicate_content(request1.text, request2.text):
            tell("Same content: " + url1 + " and " + url2)  # Tests
            # Same content with and without params: keep the shorter url.
            return url2, request2.text
        else:
            return url1, request1.text
    else:
        # No query string, nothing to deduplicate.
        return url1, request1.text
def send_doc(self, webpage_infos):
    """Send document informations to database.

    Update the document if it is already known and its last crawl is
    older than CRAWL_DELAY, otherwise insert it as a new document.

    :param webpage_infos: informations to send to database
    :type webpage_infos: dict
    :return: True if an error occured
    """
    result, response = self.send_command(
        "SELECT popularity, last_crawl FROM search WHERE url = %s",
        (webpage_infos['url'], ), True)
    if 'error' in response:
        tell('Popularity and last_crawl query failed: ' + response, 16)
        return True
    if result != ():
        # Url found in database, there is a answer:
        last_crawl = result[0][1]  # datetime.datetime object
        if (datetime.now() - last_crawl) > CRAWL_DELAY:
            # The program already crawled this website: refresh it and
            # bump its popularity.
            error = self.update(webpage_infos, result[0][0] + 1)
            if error:
                return True
        else:
            tell('Recently crawled: ' + webpage_infos['url'])
    else:
        # Url not found in database, we add it:
        error = self.insert(webpage_infos)
        if error:
            return True
    return False  # All is correct
def search_encoding(self, headers, code):
    """Find the encoding of a webpage.

    Look for a ``charset`` declaration in the HTTP headers first, then
    in the HTML source code. If an encoding is found the score is 1;
    otherwise fall back to utf-8 with a score of 0.

    :param headers: headers of the response
    :type headers: dict
    :param code: source code
    :type code: str
    :return: encoding of webpage and its score
    """
    # Search in headers (stringified dict, lowercased):
    lowered = str(headers).lower()
    start = lowered.find('charset')
    end = lowered.find('\'', start)
    if start != -1 and end != -1:
        # Skip "charset=" (8 chars) and stop at the closing quote.
        return lowered[start + 8:end], 1
    # Not in the headers: ask the HTML parser.
    self.parser_encoding.feed(code)
    found = self.parser_encoding.encoding
    if found != '':
        return found, 1
    tell('No encoding', 9, severity=0)
    return 'utf-8', 0
def https_duplicate(self, old_url):
    """Avoid https and http duplicate.

    If old url is secure (https), must delete insecure url if exists,
    then return secure url (old url).
    If old url is insecure (http), must delete it if secure url exists,
    then return secure url (new url).

    :param old_url: old url
    :type old_url: str
    :return: url to add and url to delete (None if nothing to delete)
    """
    tell('url to send: ' + old_url, severity=-1)
    new_url = database.convert_secure(old_url)  # http <-> https counterpart
    new_exists = self.doc_exists(new_url)
    if database.url_is_secure(old_url):
        # old_url starts with https
        if new_exists:  # The http counterpart exists: delete it
            return old_url, new_url
        else:
            return old_url, None
    else:
        # old_url is insecure, starts with http
        if new_exists:
            # Secure url exists: prefer it
            if self.doc_exists(old_url):  # Insecure exists too: delete it
                return new_url, old_url
            else:
                return new_url, None
        else:
            return old_url, None
def tell_progress(self, upload=True):
    """Log transfer progress as a percentage of handled files.

    :param upload: True when uploading, False when downloading
    :type upload: bool
    """
    action = 'Uploading' if upload else 'Downloading'
    if self.nb_files == 0:
        # Nothing counted yet: can't compute a percentage.
        tell('No progress data')
        return
    done = self.downuploaded_files
    percent = round(done * 100 / self.nb_files, 2)
    tell('{} {}% ({}/{})'.format(action, percent, done, self.nb_files))
def download_lists_words(self):
    """Download stopwords and badwords lists from the server."""
    tell('download list of words')
    self.connexion()
    words_files = ('en.stopwords.txt', 'fr.stopwords.txt',
                   'en.badwords.txt', 'fr.badwords.txt')
    for filename in words_files:
        # 'en.stopwords.txt' -> 'stopwords/' (drop lang prefix and '.txt')
        type_ = filename[3:-4] + '/'
        self.cd('/var/www/html/data/' + type_)
        self.get(DIR_DATA + type_ + filename, filename)
    self.disconnect()
def save_inverted_index(self, inverted_index):
    """Save inverted-index in local.

    Save it in a .json file when can't send.

    :param inverted_index: inverted-index
    :type inverted_index: dict
    """
    tell('Save inverted-index in save file')
    # Bug fix: ensure_ascii=False writes raw unicode, so force utf-8
    # instead of relying on the platform default encoding (the file is
    # read back elsewhere as utf-8).
    with open(FILE_INDEX, 'w', encoding='utf-8') as myfile:
        json.dump(inverted_index, myfile, ensure_ascii=False)
def ckeck_size_links(self, links):
    """Check number of links in file.

    Move to the next writing file when the current one holds more
    than ``self.max_links`` links.

    :param links: links saved in file
    :type links: str
    """
    if len(links) <= self.max_links:  # File is not full yet
        return
    self.writing_file_number += 1
    tell('More than {0} links : {1} : writing file {2}.'.format(
        str(self.max_links), str(len(links)), str(self.writing_file_number)),
        severity=-1)
def get_inverted_index(self):
    """Get inverted-index in local.

    Called after a connexion error: read the .json save file containing
    the inverted-index and delete it after reading.

    :return: inverted-index
    """
    tell('Get inverted-index from save file')
    # Bug fix: the save file is written with ensure_ascii=False, so read
    # it as utf-8 instead of the platform default encoding.
    with open(FILE_INDEX, 'r', encoding='utf-8') as myfile:
        inverted_index = json.load(myfile)
    remove(FILE_INDEX)
    return convert_keys(inverted_index)
def indexing(self):
    """Index crawled webpages.

    Resolve each document's database id and feed it to the index
    manager; quit safely when an id can't be found.
    """
    module.tell('Indexing', severity=2)
    for infos in self.infos:
        url = infos['url']
        doc_id = self.database.get_doc_id(url)
        if doc_id is None:
            self.safe_quit()
        module.tell('Indexing {0} {1}'.format(doc_id, url))
        self.index_manager.add_doc(infos['keywords'], doc_id, infos['language'])
def no_connexion(url='https://github.com'):
    """Check connexion.

    Try to connect to a known website.

    :param url: url used by the test
    :return: True if no connexion
    """
    try:
        # Bug fix: without a timeout this check could hang forever on a
        # half-dead network; a timeout raises and is caught below.
        requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        tell('No connexion')
        return True
    else:
        return False
def get_doc_id(self, url, table='search'):
    """Get id of a document in database.

    :param url: url of webpage
    :type url: str
    :param table: table, default to search
    :type table: str
    :return: id of webpage or None if not found
    """
    result, response = self.send_command(
        "SELECT id FROM {} WHERE url = %s".format(table), (url, ))
    # Bug fix: once unpacked, `response` is the message string itself
    # (see send_doc/doc_exists); `'error' in response[1]` only tested a
    # single character of it.
    if 'error' in response:
        tell('Failed to get id: ' + response, 11)
        return None
    if not result:
        # No row for this url: honour the documented "None if not found".
        return None
    return str(result[0])
def send_to_db(self):
    """Send all informations about crawled webpages to database.

    Can delete some documents to avoid http and https duplicates.
    """
    module.tell('Send to database', severity=2)
    for infos in self.infos:
        # Prefer the https variant and schedule the duplicate for deletion.
        infos['url'], url_to_del = self.database.https_duplicate(infos['url'])
        if url_to_del:
            self.delete_if_exists(url_to_del)
        module.tell('New url (to add): ' + infos['url'], severity=-1)
        if self.database.send_doc(infos):
            # Database error: stop cleanly.
            self.safe_quit()
def suggestions(self):
    """Get the five first url from Suggestions table and delete them.

    :return: list of url in Suggestions table, or None on error
    """
    result, response = self.send_command(
        "SELECT url FROM suggestions LIMIT 5", fetchall=True)
    # Bug fix: once unpacked, `response` is the message string itself;
    # `'error' in response[1]` only tested a single character of it.
    if 'error' in response:
        tell('Failed to get url: ' + response, 13)
        return None
    else:
        suggested_links = list()
        for element in result:
            if len(suggested_links) < 5:
                suggested_links.append(element[0])
                self.del_one_doc(element[0], 'suggestions')
        return suggested_links
def del_one_doc(self, url, table='search'):
    """Delete document corresponding to url from the given table.

    :param url: url of webpage
    :type url: str
    :param table: table where given url is, default to search
    :type table: str
    :return: status message
    """
    tell('Delete from {} doc: '.format(table) + url)
    response = self.send_command(
        "DELETE FROM {} WHERE url = %s".format(table), (url, ))
    # send_command result is not unpacked here: response[1] is the message.
    if 'error' in response[1]:
        tell('Doc not removed: {0}, {1}'.format(url, response[1]), 12)
    return response[1]
def check_size_files(self):
    """Archive log files that grew past MAX_SIZE lines.

    Each rotated log becomes a member of a sibling .zip archive, named
    with an increasing number ('0', '1', ...).
    """
    for filelog in [FILE_EVENTS, FILE_ERRORS]:
        filearchive = filelog[:-3] + 'zip'  # foo.log -> foo.zip
        with open(filelog, 'r') as myfile:
            content = myfile.readlines()
        if len(content) > MAX_SIZE:
            if not path.exists(filearchive):
                # First rotation: create an empty archive, member will be '0'.
                ZipFile(file=filearchive, mode='w').close()
                filename = '0'
            else:
                with ZipFile(filearchive, 'r') as myzip:
                    filename = str(int(myzip.namelist()[-1]) + 1)  # The last one +1
            # Move the log aside under its member name, zip it, then
            # delete the temporary file.
            rename(filelog, filename)
            with ZipFile(filearchive, 'a') as myzip:
                myzip.write(filename)
            remove(filename)
            tell('Archiving ' + filelog + ': ' + filename, severity=-1)
def insert(self, infos):
    """Insert a new document in database.

    :param infos: doc infos
    :type infos: dict()
    :return: True if an error occured
    """
    tell('Adding ' + infos['url'])
    # Parameterized query: values are bound by the driver, never
    # interpolated into the SQL string.
    response = self.send_command(
        """INSERT INTO search (title, description, url, first_crawl,
last_crawl, language, likes, popularity, score, homepage, sanesearch, favicon)
VALUES (%s, %s, %s, NOW(), NOW(), %s, 0, 1, %s, %s, %s, %s)""", \
        (infos['title'], infos['description'], infos['url'],
         infos['language'], infos['score'], infos['homepage'],
         infos['sanesearch'], infos['favicon']))
    if 'error' in response[1]:
        tell('Failed to add: ' + response[1], 10)
        return True
    else:
        return False
def start(self):
    """Start main loop of crawling.

    Crawl 10 webpages, send documents to database, index them and save
    the configurations (line number in links file, ...).
    Send the inverted-index and check for suggestions each 500 crawled
    webpages. Do it until the user wants to stop crawling or an error
    occured.
    """
    run = True
    while run:
        stats_send_index = time()  # Start time of this send-index period
        self.suggestions()
        for _ in range(50):
            module.tell('Crawl', severity=2)
            begining = time()
            # Collect informations for 10 webpages:
            while len(self.infos) < 10:
                module.tell('File {0}, line {1}'.format(
                    str(self.file_manager.reading_file_number),
                    str(self.file_manager.reading_line_number + 1)),
                    severity=0)
                url = self.file_manager.get_url()  # Get the url of the website
                if url == 'stop':
                    self.safe_quit()
                self.crawl_webpage(url)
            # End of crawling loop
            module.tell('{} new documents!'.format(self.crawled_websites),
                        severity=-1)
            self.send_to_db()
            self.indexing()
            module.stats_webpages(begining, time())
            self.infos.clear()  # Reset the list of dict of informations of websites.
            self.file_manager.check_stop_crawling()
            self.file_manager.save_config()
            if self.file_manager.run == 'false':
                # User asked to stop via the config file.
                module.tell('User wants stop program')
                self.safe_quit()
                run = False
                break
        # End of loop range(n)
        if run:
            self.suggestions()
            self.send_inverted_index()
            self.file_manager.check_size_files()
            module.stats_send_index(stats_send_index, time())
def doc_exists(self, url, table='search'):
    """Check if url is in database.

    :param url: url corresponding to doc
    :type url: str
    :param table: table, default to search
    :type table: str
    :return: True if doc exists, None on query error
    """
    result, response = self.send_command(
        "SELECT EXISTS(SELECT * FROM {} WHERE url=%s)".format(table),
        (url, ))
    if 'error' in response:
        tell('Failed to check row: ' + response, 14)
        return None
    # EXISTS(...) yields 1 or 0 in the first column.
    return True if result[0] == 1 else False
def send_inverted_index(self, inverted_index):
    """Send inverted-index.

    Mirror the language/first-letter directory layout on the server
    and in local, then upload every index file.

    :param inverted_index: inverted-index to send
    :type inverted_index: dict
    :return: True if an error occured
    """
    tell('send inverted-index')
    self.downuploaded_files = 0
    self.nb_files = count_files_index(inverted_index)  # Count files from index (prepare to upload)
    self.connexion()
    self.cd(self.sftp_index)
    for language in inverted_index:
        # Create the language directory on the server and locally if missing:
        list_language = self.listdir()
        if language not in list_language:
            self.mkdir(language)
        if not path.isdir(DIR_INDEX + language):
            mkdir(DIR_INDEX + language)
        self.cd(language)
        for first_letter in inverted_index[language]:
            self.tell_progress()
            # Same for the first-letter sub-directory:
            list_first_letter = self.listdir()
            if first_letter not in list_first_letter:
                self.mkdir(first_letter)
            if not path.isdir(DIR_INDEX + language + '/' + first_letter):
                mkdir(DIR_INDEX + language + '/' + first_letter)
            self.cd(first_letter)
            for two_letters in inverted_index[language][first_letter]:
                index = inverted_index[language][first_letter][two_letters]
                self.upload(language, first_letter, two_letters, index)
            self.cd('..')
        self.cd('..')
    self.disconnect()
    tell('Transfer complete', severity=0)
    return False
def update(self, infos, popularity):
    """Update a document in database.

    :param infos: doc infos
    :type infos: dict()
    :param popularity: new doc popularity
    :type popularity: int
    :return: True if an error occured
    """
    tell('Updating ' + infos['url'])
    # Parameterized query: values are bound by the driver, never
    # interpolated into the SQL string.
    response = self.send_command(
        """UPDATE search SET title=%s, description=%s, last_crawl=NOW(),
language=%s, popularity=%s, score=%s, homepage=%s, sanesearch=%s, favicon=%s
WHERE url = %s """,
        (infos['title'], infos['description'], infos['language'], popularity,
         infos['score'], infos['homepage'], infos['sanesearch'],
         infos['favicon'], infos['url']))
    if 'error' in response[1]:
        tell('Failed to update: ' + response[1], 9)
        return True
    else:
        return False
def get_url(self):
    """Get url of next webpage.

    Check the size of current reading links and increment it if over.

    :return: url of webpage to crawl, or 'stop' when no url is available
    """
    filename = DIR_LINKS + str(self.reading_file_number)
    try:
        with open(filename, 'r', errors='replace', encoding='utf8') as myfile:
            list_links = myfile.read().splitlines()  # List of urls
    except FileNotFoundError:
        tell('Reading file is not found in get_url: ' + filename, 4)
        return 'stop'
    else:
        try:
            url = list_links[self.reading_line_number]
        except IndexError:
            # Bug fix: a truncated or stale file could make the saved
            # line number point past the end; stop cleanly instead of
            # crashing.
            tell('Reading line out of range in get_url: ' + filename, 4)
            return 'stop'
        self.reading_line_number += 1
        # If it is the last links of the file:
        if len(list_links) == (self.reading_line_number):
            self.reading_line_number = 0
            if self.reading_file_number != 0:
                remove(filename)
                tell('File ' + filename + ' removed', severity=-1)
            self.reading_file_number += 1
            # The program have read all the links: next reading_file_number
            tell('Next reading file: ' + str(self.reading_file_number),
                 severity=-1)
        return url
def sane_search(self, keywords, language, max_ratio=.2):
    """Filter adults websites.

    :param keywords: webpage's keywords
    :type keywords: list
    :param language: found website language
    :type language: str
    :param max_ratio: badwords ratio above which the site is flagged
    :type max_ratio: float
    :return: True if the webpage is flagged as a bad site
    """
    if not keywords:
        # Bug fix: an empty keyword list used to raise ZeroDivisionError.
        return False
    badwords = self.BADWORDS[language]
    nb_badwords = sum(1 for keyword in keywords if keyword in badwords)
    ratio = nb_badwords / len(keywords)
    if ratio >= max_ratio:
        tell('bad site detected')
        return True
    else:
        return False
def crawl_webpage(self, url):
    """Crawl the given url.

    Get webpage source code, feed it to the parser, manage extracting
    data, manage redirections and can delete some documents to avoid
    duplicates.

    :param url: url of webpage
    :type url: str
    """
    module.tell('Crawling ' + url)
    # Get webpage's html code:
    new_url, html_code, nofollow, score, all_urls = self.web_connexion.get_code(
        url)
    if html_code is None:
        self.delete_if_exists(all_urls)  # Failed to get code, must delete from database.
    elif html_code == 'no connexion':
        # Whole network is down: stop the program.
        sys.exit()
    elif html_code == 'ignore':
        # There was something wrong and maybe a redirection.
        self.delete_if_exists(all_urls)
    else:
        module.tell('New url: ' + new_url, severity=0)
        self.delete_if_exists(all_urls)  # Except new url
        webpage_infos, links = self.site_informations.get_infos(
            new_url, html_code, nofollow, score)
        webpage_infos['url'] = new_url
        if webpage_infos['title'] != '':
            if module.can_add_doc(self.infos, webpage_infos):  # Duplicate only with url
                self.infos.append(webpage_infos)
                self.crawled_websites += 1
                links = self.file_manager.save_links(links)
                self.file_manager.ckeck_size_links(links)
        else:
            # No title: not worth keeping.
            self.delete_if_exists(new_url)
def read_inverted_index(self):
    """Get inverted-index in local.

    Called after sending inverted-index without error: read back all
    files created for sending the inverted-index.

    :return: inverted-index
    """
    tell('Get inverted-index in local')
    inverted_index = dict()
    for language in listdir(DIR_INDEX):
        lang_dir = DIR_INDEX + language
        index_language = inverted_index[language] = dict()
        for first_letter in listdir(lang_dir):
            letter_dir = lang_dir + '/' + first_letter
            index_letter = index_language[first_letter] = dict()
            for filename in listdir(letter_dir):
                with open(letter_dir + '/' + filename, 'r',
                          encoding='utf-8') as myfile:
                    # Key is the filename minus its 4-char extension.
                    index_letter[filename[:-4]] = json.load(myfile)
    return convert_keys(inverted_index)
def delete_if_exists(self, urls):
    """Delete bad doc if exists.

    Check if doc exists in database and delete it from database and
    inverted-index.

    :param urls: url(s) to delete
    :type urls: str or list
    """
    url_list = [urls] if isinstance(urls, str) else urls
    for url in url_list:
        exists = self.database.doc_exists(url)
        if exists is None:
            # Database error while checking existence.
            self.safe_quit()
        elif not exists:
            module.tell('Ignore: ' + url, severity=-1)
        else:
            doc_id = self.database.get_doc_id(url)
            if not doc_id:
                # Could not resolve the id: stop cleanly.
                self.safe_quit()
            else:
                self.database.del_one_doc(url)
                self.index_manager.delete_doc_id(doc_id)
def get_code(self, url):
    """Get source code of given url.

    :param url: url of webpage
    :type url: str
    :return: new url (redirection), source code, True if no take links,
        score and list of urls to delete
    """
    nofollow, url = connexion.is_nofollow(url)
    result = self.send_request(url)
    if not isinstance(result, requests.models.Response):
        # send_request failed: result is None or 'no connexion'.
        return None, result, None, None, url
    else:
        request = result
        del result
    allowed = self.check_robots_perm(url)
    if request.status_code == requests.codes.ok and request.headers.get(
            'Content-Type', '').startswith('text/html') and allowed:
        # Search encoding of webpage:
        request.encoding, score = self.search_encoding(
            request.headers, request.text)
        new_url, code = self.duplicate_content(
            request, url)  # new_url is clean and maybe without params
        all_urls = connexion.all_urls(request)  # List of urls to delete
        if new_url in all_urls:  # new_url don't be delete
            all_urls.remove(new_url)
        return new_url, code, nofollow, score, all_urls
    else:
        # Not crawlable (bad status, non-html or robots.txt denies).
        tell('Webpage infos: status code=' + str(request.status_code) +
             ', Content-Type=' + request.headers.get('Content-Type', '') +
             ', robots perm=' + str(allowed), severity=0)
        # All redirections urls, the first and the last:
        all_urls = connexion.all_urls(request)
        all_urls.append(request.url)
        all_urls.append(url)
        return None, 'ignore', None, None, remove_duplicates(all_urls)
def check_robots_perm(self, url):
    """Check robots.txt for permission.

    Any failure while fetching or parsing robots.txt is treated as
    permission granted (best-effort policy): a broken robots.txt must
    never stop the crawler.

    :param url: webpage url
    :type url: str
    :return: True if can crawl
    """
    try:
        allowed = self.reqrobots.allowed(url, USER_AGENT)
    except ServerError as error:
        tell('Error robots.txt (reppy): ' + str(error) + ' ' + url, 6)
        allowed = True
    except requests.exceptions.Timeout:
        # NOTE: Timeout must be caught before RequestException (subclass).
        tell('Error robots.txt (timeout): ' + url)
        allowed = True
    except requests.exceptions.RequestException as error:
        tell('Error robots.txt (requests): ' + str(error) + ' ' + url, 7)
        allowed = True
    except Exception as error:
        # Deliberate broad catch: log and allow.
        tell('Unknow robots.txt error: ' + str(error) + ' ' + url, 8)
        allowed = True
    return allowed
def send_request(self, url):
    """Fetch the given url with the crawler's headers and timeout.

    :param url: url to fetch
    :type url: str
    :return: the Response on success, None on failure, or the string
        'no connexion' when the whole network seems down
    """
    try:
        request = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    except requests.packages.urllib3.exceptions.ReadTimeoutError:
        tell('Read timeout error (urllib3): ' + url, 3)
        return None
    except requests.exceptions.Timeout:
        tell('Timeout error: ' + url, 4)
        return None
    except requests.exceptions.RequestException as error:
        tell('Connexion failed: {}, {}'.format(str(error), url), 5)
        # Distinguish a dead network from a dead website:
        if connexion.no_connexion():
            return 'no connexion'
        else:
            return None
    else:
        return request
def get_inverted_index(self):
    """Get inverted-index.

    Download every index file from the server, mirroring the
    language/first-letter directory layout locally.

    :return: inverted-index
    """
    tell('Get inverted-index from server')
    self.downuploaded_files = 0
    inverted_index = dict()
    self.connexion()
    self.cd(self.sftp_index)
    self.nb_files = self.countfiles()  # Count files on server (prepare to download)
    list_language = self.listdir()
    for language in list_language:
        self.cd(language)
        # Create the local language directory if missing:
        if not path.isdir(DIR_INDEX + language):
            mkdir(DIR_INDEX + language)
        inverted_index[language] = dict()
        list_first_letter = self.listdir()
        for first_letter in list_first_letter:
            self.tell_progress(False)
            self.cd(first_letter)
            if not path.isdir(DIR_INDEX + language + '/' + first_letter):
                mkdir(DIR_INDEX + language + '/' + first_letter)
            inverted_index[language][first_letter] = dict()
            list_filename = self.listdir()
            for filename in list_filename:
                # Key is the filename minus its 4-char extension.
                inverted_index[language][first_letter][
                    filename[:-4]] = self.download(language, first_letter,
                                                   filename)
            self.cd('..')
        self.cd('..')
    self.disconnect()
    if inverted_index == dict():
        tell('No inverted-index on server', severity=0)
    else:
        tell('Transfer complete', severity=0)
    return inverted_index