def https_duplicate(self, old_url):
    """Resolve an http/https duplicate pair for a url.

    When the old url is already secure (https), the insecure twin must be
    deleted if it exists and the old url is kept.  When the old url is
    insecure (http) and its secure twin exists, the insecure one must be
    deleted and the secure one kept instead.

    :param old_url: old url
    :type old_url: str
    :return: tuple (url to add, url to delete or None)
    """
    tell('url to send: ' + old_url, severity=-1)
    new_url = database.convert_secure(old_url)
    new_exists = self.doc_exists(new_url)
    if database.url_is_secure(old_url):
        # Secure url: drop the insecure twin only when it is stored.
        return (old_url, new_url) if new_exists else (old_url, None)
    # Insecure url: prefer the secure twin when it exists.
    if new_exists:
        if self.doc_exists(old_url):
            # Both variants stored: delete the insecure one.
            return new_url, old_url
        return new_url, None
    return old_url, None
def send_doc(self, webpage_infos):
    """Send document informations to database.

    Update the document when it is already stored and its last crawl is
    older than ``CRAWL_DELAY``; insert it when it is unknown.

    :param webpage_infos: informations to send to database
    :type webpage_infos: dict
    :return: True if an error occured
    """
    error = False  # no error
    response = self.connection()
    result, response = self.send_command(
        "SELECT popularity, last_crawl, domain FROM {} WHERE url = %s".format(self.t[0]),
        # 1-tuple: a bare string is not a valid DB-API params sequence
        # (it would be interpreted as one parameter per character).
        (webpage_infos['url'],), True)
    if 'error' in response:
        # str(): response may not be a plain str; '+' on a tuple raises.
        tell('Popularity and last_crawl query failed: ' + str(response))
        error = True
    if result != ():
        # Url found in database, there is an answer:
        last_crawl = result[0][1]  # datetime.datetime object
        if (datetime.now() - last_crawl) > CRAWL_DELAY:
            # Enough time elapsed since the last crawl: refresh the doc.
            error = self.update(webpage_infos, result)
        else:
            tell('Recently crawled: ' + webpage_infos['url'])
    else:
        # Url not found in database, the url doesn't exist in the
        # database, we add it:
        error = self.insert(webpage_infos)
    self.close_connection()
    return error  # All is correct
def read_links(self):
    """Get url of next webpage.

    Check the size of current reading links and increment it if over.

    :return: url of webpage to crawl
    """
    self.domains = swiftea_bot_links.get_domains()
    filename_ptr, reading_line_number = swiftea_bot_links.get_filename_read(
        self.domains, self.crawl_option)
    filename = data.DIR_LINKS + str(filename_ptr)
    if not path.exists(filename):
        tell('Reading file not found in get_url: ' + filename, 4)
        return 'error'
    with open(filename, 'r', errors='replace', encoding='utf8') as links_file:
        list_links = links_file.read().splitlines()  # one url per line
    # Every link of this file has been consumed: mark the level done.
    if len(list_links) == reading_line_number - 1:
        self.domains[filename_ptr]['completed'] = 1
        return '#level_complete#'
    tell('File {0}, line {1}'.format(
        str(filename_ptr), str(reading_line_number)), severity=0)
    return list_links[reading_line_number - 1]
def tell_progress(self, upload=True):
    """Log the current transfer progress.

    :param upload: True when uploading, False when downloading
    :type upload: bool
    """
    if self.nb_files == 0:
        tell('No progress data')
        return
    verb = 'Uploading' if upload else 'Downloading'
    percent = round(self.downuploaded_files * 100 / self.nb_files, 2)
    tell('{} {}% ({}/{})'.format(
        verb, percent, self.downuploaded_files, self.nb_files))
def download_lists_words(self):
    """Download stopwords and badwords from the FTP server."""
    tell('download list of words')
    self.connection()
    # Same order as before: both stopwords files, then both badwords.
    for kind in ('stopwords', 'badwords'):
        remote_dir = kind + '/'
        for lang in ('en', 'fr'):
            filename = lang + '.' + kind + '.txt'
            self.cd(self.FTP_DATA + remote_dir)
            self.get(remote_dir + filename, filename)
    self.disconnect()
def get_url(self):
    """Return the next url to crawl, advancing the crawl level when the
    current level is complete.

    :return: url, or '#target-reached#' when the target level is reached
    """
    url = self.read_links()
    if url != '#level_complete#':
        return url
    # Current level exhausted: log (pre-increment value), then advance.
    tell('Level complete, new level: ' + str(self.crawl_option['level']))
    self.crawl_option['level'] += 1
    swiftea_bot_links.save_domains(self.domains)
    if self.crawl_option['level'] < self.crawl_option['target-level']:
        return self.read_links()
    return '#target-reached#'
def save_inverted_index_json(self, inverted_index):
    """Save inverted-index in local.

    Save it in a json file when we can't send it.

    :param inverted_index: inverted-index
    :type inverted_index: dict
    """
    tell('Save inverted-index in save file')
    # Explicit encoding: ensure_ascii=False emits non-ASCII characters,
    # which the platform default codec may fail to encode.
    with open(data.FILE_INDEX, 'w', encoding='utf-8') as myfile:
        json.dump(inverted_index, myfile, ensure_ascii=False)
def get_inverted_index(self):
    """Get inverted-index in local.

    Called after a connection error. Read a json file that contains
    the inverted-index. Delete this file after reading it.

    :return: inverted-index
    """
    tell('Get inverted-index from save file')
    # Explicit encoding: the save file is written as UTF-8
    # (ensure_ascii=False), so the platform default may not decode it.
    with open(data.FILE_INDEX, 'r', encoding='utf-8') as myfile:
        inverted_index = json.load(myfile)
    remove(data.FILE_INDEX)
    return convert_keys(inverted_index)
def get_doc_id(self, url):
    """Get id of a document in database.

    :param url: url of webpage
    :type url: str
    :return: id of webpage (str) or None if not found
    """
    result, response = self.send_command(
        # 1-tuple: a bare string is not a valid DB-API params sequence.
        "SELECT id FROM {} WHERE url = %s".format(self.t[0]), (url,))
    if 'error' in response[1]:
        # str(): response is a tuple (its second item is inspected
        # above); concatenating a tuple to a str raises TypeError.
        tell('Failed to get id: ' + str(response))
        return None
    return str(result[0])
def save_inverted_index(self, inverted_index):
    """Save inverted index in `.sif` files."""
    tell('Save inverted index in `.sif` files.')
    for language, letters in inverted_index.items():
        language_dir = self.DIR_INDEX + language
        if not path.isdir(language_dir):
            mkdir(language_dir)
        for first_letter, groups in letters.items():
            letter_dir = language_dir + '/' + first_letter
            if not path.isdir(letter_dir):
                mkdir(letter_dir)
            for two_letters, index in groups.items():
                filepath = letter_dir + '/' + two_letters + '.sif'
                with open(filepath, 'w', encoding='utf-8') as index_file:
                    json.dump(index, index_file, ensure_ascii=False)
def indexing(self):
    """Index crawled webpages.

    Fetch the database id of each document and feed it, with its
    keywords and language, to the index manager.
    """
    module.tell('Indexing', severity=2)
    for infos in self.infos:
        doc_id = self.database.get_doc_id(infos['url'])
        if doc_id is None:
            # No id means the database is inconsistent: stop cleanly.
            module.safe_quit()
        module.tell('Indexing {0} {1}'.format(doc_id, infos['url']))
        self.index_manager.add_doc(infos['keywords'], doc_id, infos['language'])
def check_connection(url='https://github.com'):
    """Test internet connection.

    Try to connect to a website.

    :param url: url used to test the connection
    :return: True if connected to internet
    """
    try:
        requests.get(url)
    except requests.exceptions.RequestException:
        tell('No connection')
        return False
    return True
def del_one_doc(self, url, table=None):
    """Delete document corresponding to url.

    :param url: url of webpage
    :type url: str
    :param table: table to delete from (defaults to the main table)
    :return: status message
    """
    table = self.t[0] if table is None else table
    tell('Delete from {} doc: {}'.format(table, url))
    response = self.send_command(
        "DELETE FROM {} WHERE url = %s".format(table), (url))
    failed = 'error' in response[1] or response[1][1] != 'Send command: ok'
    if failed:
        tell('Doc not removed: {0}, {1}'.format(url, response[1]))
    return response[1]
def test_check_size_files(self):
    """Exercise check_size_files before and after shrinking the limit."""
    file_manager.FileManager.check_size_files(self)
    self.max_size_file = 1
    # Twice: write two log lines then trigger the size check.
    for _ in range(2):
        module.tell('Simple message')
        module.tell('Simple message')
        file_manager.FileManager.check_size_files(self)
def doc_exists(self, url, table=None):  # TODO: refactor: reuse get_doc_id
    """Check if `url` is in database.

    :param url: url corresponding to doc
    :type url: str
    :return: True if doc exists, None on error
    """
    table = self.t[0] if table is None else table
    result, response = self.send_command(
        "SELECT EXISTS(SELECT * FROM {} WHERE url=%s)".format(table), (url))
    if 'error' in response:
        tell('Failed to check row: ' + response)
        return None
    return result[0] == 1
def send_to_db(self):
    """Send all informations about crawled webpages to database.

    Can delete some documents to avoid http and https duplicates.
    """
    module.tell('Send to database', severity=2)
    for webpage_infos in self.infos:
        kept_url, url_to_del = self.database.https_duplicate(
            webpage_infos['url'])
        webpage_infos['url'] = kept_url
        if url_to_del:
            self.delete_bad_url(url_to_del)
        module.tell('New url (to add): ' + kept_url, severity=-1)
        if self.database.send_doc(webpage_infos):
            module.safe_quit()
def read_inverted_index(self):
    """Get inverted-index in local.

    Read all files created to send inverted-index.

    :return: inverted-index
    """
    tell('Get inverted-index in local')
    inverted_index = dict()
    for language in listdir(self.DIR_INDEX):
        inverted_index[language] = dict()
        language_dir = self.DIR_INDEX + language
        for first_letter in listdir(language_dir):
            inverted_index[language][first_letter] = dict()
            letter_dir = language_dir + '/' + first_letter
            for filename in listdir(letter_dir):
                with open(letter_dir + '/' + filename, 'r', encoding='utf-8') as index_file:
                    # Key is the filename without its '.sif' extension.
                    inverted_index[language][first_letter][filename[:-4]] = json.load(index_file)
    return convert_keys(inverted_index)
def suggestions(self):
    """Get the five first URLs from Suggestion table and delete them.

    :return: list of url in Suggestion table, or None on error
    """
    result, response = self.send_command(
        "SELECT url FROM suggestion LIMIT 5", fetchall=True)
    if 'error' in response[1] or result is None:
        # str(): response is a tuple (its second item is inspected
        # above); concatenating a tuple to a str raises TypeError.
        tell('Failed to get url: ' + str(response))
        return None
    suggested_links = list()
    for element in result:
        # LIMIT 5 already caps the result; this guard is belt-and-braces.
        if len(suggested_links) < 5:
            suggested_links.append(element[0])
            self.del_one_doc(element[0], self.t[1])
    return suggested_links
def check_size_files(self):
    """Archive log files that grew beyond ``data.MAX_SIZE`` lines.

    Each oversized log is stored as a numbered entry of a sibling
    ``.zip`` archive and removed from disk.
    """
    for filelog in [data.FILE_EVENTS, data.FILE_ERRORS]:
        # Archive lives next to the log: only the 3-char extension is
        # swapped, e.g. 'events.log' -> 'events.zip'.
        filearchive = filelog[:-3] + 'zip'
        if not path.exists(filelog):
            continue
        with open(filelog, 'r') as myfile:
            content = myfile.readlines()
        if len(content) > data.MAX_SIZE:
            if not path.exists(filearchive):
                # Create an empty archive; the first entry is named '0'.
                ZipFile(file=filearchive, mode='w').close()
                filename = '0'
            else:
                with ZipFile(filearchive, 'r') as myzip:
                    # Entries are appended in order, so the last name is
                    # the highest number so far.
                    filename = str(int(myzip.namelist()[-1])+1)  # The last one +1
            # Move the log under its entry name, archive it, then drop
            # the temporary file — order matters here.
            rename(filelog, filename)
            with ZipFile(filearchive, 'a') as myzip:
                myzip.write(filename)
            remove(filename)
            tell('Archiving ' + filelog + ': ' + filename, severity=-1)
def insert(self, infos):
    """Insert a new document in database.

    :param infos: doc infos
    :type infos: dict()
    :return: True if an error occured
    """
    tell('Adding ' + infos['url'])
    values = (infos['title'], infos['description'], infos['url'],
              infos['language'], infos['score'], infos['homepage'],
              infos['sanesearch'], infos['favicon'], self.domain)
    response = self.send_command(
        """INSERT INTO {} (title, description, url, first_crawl, last_crawl, language, popularity, score, homepage, sanesearch, favicon, domain) VALUES (%s, %s, %s, NOW(), NOW(), %s, 1, %s, %s, %s, %s, %s)""".format(self.t[0]),
        values)
    if 'error' in response[1][1]:
        tell('Failed to add: ' + str(response))
        return True
    return False
def suggestions(self):
    """Crawl and index the suggested urls.

    Get 5 urls from database, delete them, crawl them,
    send all informations about them, index them.
    """
    suggestions = self.database.suggestions()
    if suggestions is None:
        module.tell('Failed to get suggestions')
        return
    suggestions = data_processing.clean_links(suggestions)
    if not suggestions:
        module.tell('No suggestions')
        return
    module.tell('Suggestions', severity=2)
    for url in suggestions:
        # result[0]: webpage_infos ; result[1]: links
        result = self.crawl_webpage(url)
        if result:
            self.infos.append(result[0])
            self.file_manager.save_links(result[1])
    self.send_to_db()
    self.indexing()
    self.infos.clear()  # Reset the list of dict of informations of websites.
def crawl_webpage(self, url):
    """Crawl the given url.

    Get webpage source code, feed it to the parser, manage
    extracted data and redirections, and delete some documents
    to avoid duplicates.

    :param url: url of webpage
    :type url: str
    """
    module.tell('Crawling ' + url)
    # Get webpage's html code:
    new_url, html_code, nofollow, score, all_urls = self.web_connection.get_code(url)
    if html_code is None:
        # Failed to get code, must delete from database.
        self.delete_bad_url(all_urls)
        return None
    if html_code == 'no connection':
        module.safe_quit()
    if html_code == 'ignore':
        # There was something wrong and maybe a redirection.
        self.delete_bad_url(all_urls)
        return None
    module.tell('New url: ' + new_url, severity=0)
    webpage_infos, list_links = self.site_informations.get_infos(
        new_url, html_code, nofollow, score)
    self.delete_bad_url(all_urls, webpage_infos['language'])  # Except new url
    webpage_infos['url'] = new_url
    if webpage_infos['title'] == '':
        self.delete_bad_url(new_url, webpage_infos['language'])
        return None
    # Keep the doc only when it is not a duplicate (url check only).
    if module.can_add_doc(self.infos, webpage_infos):
        self.crawled_websites += 1
        return webpage_infos, list_links
    return None
def sane_search(self, keywords, language, max_ratio=.2):
    """Filter pages not suitable for a young audience.

    :param keywords: webpage's keywords
    :type keywords: list
    :param language: found website language
    :type language: str
    :param max_ratio: badword ratio above which the page is flagged
    :return: True if the page is flagged, else False
    """
    badwords = self.BADWORDS[language]
    total = len(keywords)
    if total == 0:
        return False
    flagged = sum(1 for keyword in keywords if keyword in badwords)
    if flagged / total >= max_ratio:
        tell('bad site detected')
        return True
    return False
def update(self, infos, result):
    """Update a document in database.

    :param infos: doc infos
    :type infos: dict()
    :param result: previous rows; result[0][0] is the stored popularity
    :type result: tuple
    :return: True if an error occured
    """
    tell('Updating ' + infos['url'])
    cmd = """
UPDATE {} SET title=%s, description=%s, last_crawl=NOW(),
language=%s, popularity=%s, score=%s, homepage=%s, sanesearch=%s, favicon=%s
WHERE url = %s""".format(self.t[0])
    response = self.send_command(
        cmd,
        (infos['title'], infos['description'], infos['language'],
         result[0][0] + 1,  # popularity is incremented on every re-crawl
         infos['score'], infos['homepage'], infos['sanesearch'],
         infos['favicon'], infos['url']))
    if 'error' in response[1]:
        # Fixed log text (stray '[' removed) and str() in case the
        # message is not a plain string.
        tell('Failed to update: ' + str(response[1]), -2)
        return True
    return False
def delete_bad_url(self, urls, language='*'):
    """Delete bad doc if exists.

    Check if doc exists in database and delete it from database
    and inverted-index.

    :param urls: url(s) to delete
    :type urls: str or list
    """
    url_list = [urls] if isinstance(urls, str) else urls
    for url in url_list:
        doc_exists = self.database.doc_exists(url)
        if doc_exists is None:
            # Database error while checking existence.
            module.safe_quit()
        elif doc_exists:
            doc_id = self.database.get_doc_id(url)
            if doc_id:
                self.database.del_one_doc(url)
                self.index_manager.delete_doc_id(doc_id, language)
            else:
                module.safe_quit()
        else:
            module.tell('Ignore: ' + url, severity=-1)
def get_inverted_index(self):
    """Get inverted-index.

    Download every `.sif` file from the FTP server, mirroring the
    remote language/first-letter directory layout locally.

    :return: inverted-index
    """
    tell('Get inverted-index from server')
    self.downuploaded_files = 0
    inverted_index = dict()
    self.connection()
    self.cd(self.FTP_INDEX)
    self.nb_files = self.countfiles()  # Count files on server (prepare to download)
    list_language = self.listdir()
    for language in list_language:
        self.cd(language)
        if not path.isdir(self.DIR_INDEX + language):
            mkdir(self.DIR_INDEX + language)
        inverted_index[language] = dict()
        list_first_letter = self.listdir()
        for first_letter in list_first_letter:
            self.tell_progress(False)  # False -> 'Downloading' progress message
            self.cd(first_letter)
            if not path.isdir(self.DIR_INDEX + language + '/' + first_letter):
                mkdir(self.DIR_INDEX + language + '/' + first_letter)
            inverted_index[language][first_letter] = dict()
            list_filename = self.listdir()
            for filename in list_filename:
                # Key is the filename without its '.sif' extension.
                inverted_index[language][first_letter][filename[:-4]] = self.download(language, first_letter, filename)
            self.cd('..')  # leave the first-letter directory
        self.cd('..')  # leave the language directory
    self.disconnect()
    if inverted_index == dict():
        tell('No inverted-index on server', severity=0)
    else:
        tell('Transfer complete', severity=0)
    return inverted_index
def start(self):
    """Start main loop of crawling.

    Crawl webpages in batches, send documents to database, index them
    and save the configurations (line number in links file, ...).
    Send the inverted-index and check for suggestions after each batch
    of `l1` rounds. Do it until the user wants to stop crawling or an
    error occured.
    """
    module.tell('Starting with base urls')
    self.get_inverted_index()
    if not path.exists(data.FILE_LINKS):
        # Bootstrap the links file with an empty sentinel domain entry.
        links.save_domains([{
            'domain': '',
            'level': -1,
            'completed': 0,
            'line': 1,
            'file': 0
        }])
    run = True
    # NOTE: `run` is never reassigned here; the loop only exits through
    # module.safe_quit() below.
    while run:
        stats_crawl = time()
        self.suggestions()
        for _ in range(self.l1):
            module.tell('Crawl', severity=2)
            begining = time()
            # Fill self.infos with up to l2 crawled documents.
            while len(self.infos) < self.l2:
                begining = time()
                # Start of crawling loop
                url = self.file_manager.get_url()
                if url == 'error':
                    module.safe_quit()
                result = self.crawl_webpage(url)
                # result[0]: webpage_infos, result[1]: links
                if result:
                    self.infos.append(result[0])
                    # save links and get next url:
                    self.file_manager.save_links(result[1])
                # Record per-page crawl duration for statistics.
                with open(data.DIR_STATS + 'stat_crawl_one_webpage', 'a') as myfile:
                    myfile.write(str(time() - begining) + '\n')
                # End of crawling loop
            module.tell('{} new documents!'.format(self.crawled_websites), severity=-1)
            self.send_to_db()
            self.indexing()
            module.stats_webpages(begining, time())
            self.infos.clear()  # Reset the list of dict of informations of websites.
            self.file_manager.check_stop_crawling()
            self.file_manager.save_config()
            if self.file_manager.run == 'false':
                module.tell('User wants stop program')
                module.safe_quit()
            # End of loop range(n)
        self.suggestions()
        self.send_inverted_index()
        self.file_manager.check_size_files()
        module.stats_crawl(stats_crawl, time())
def upload(self, language, first_letter, two_letters, index):
    """Upload one `.sif` index file to the server.

    NOTE(review): the ``index`` parameter is not used here — the
    already-saved local file is uploaded instead; kept for interface
    compatibility.
    """
    local_name = language + '/' + first_letter + '/' + two_letters + '.sif'
    remote_name = two_letters + '.sif'
    tell('uploading {} in {}'.format(self.DIR_INDEX + local_name, remote_name))
    self.put(self.DIR_INDEX + local_name, remote_name)
    self.downuploaded_files += 1
def send_inverted_index(self, inverted_index):
    """Send inverted-index.

    Upload every `.sif` file, creating the remote language and
    first-letter directories when they are missing.

    :param inverted_index: inverted-index to send
    :type inverted_index: dict
    :return: True if an error occured
    """
    tell('send inverted-index')
    self.downuploaded_files = 0
    self.nb_files = count_files_index(inverted_index)  # Count files from index (prepare to upload)
    self.connection()
    files = self.listdir()
    if self.FTP_INDEX not in files:
        self.mkdir(self.FTP_INDEX)
    self.cd(self.FTP_INDEX)
    tell('go to ' + self.FTP_INDEX)
    for language in inverted_index:
        list_language = self.listdir()
        if language not in list_language:
            self.mkdir(language)
        self.cd(language)
        tell('go to ' + language)
        for first_letter in inverted_index[language]:
            self.tell_progress()  # default upload=True -> 'Uploading' message
            list_first_letter = self.listdir()
            if first_letter not in list_first_letter:
                self.mkdir(first_letter)
            self.cd(first_letter)
            tell('go to ' + first_letter)
            for two_letters in inverted_index[language][first_letter]:
                index = inverted_index[language][first_letter][two_letters]
                self.upload(language, first_letter, two_letters, index)
            self.cd('..')  # leave the first-letter directory
            tell('go back')
        self.cd('..')  # leave the language directory
        tell('go back')
    self.disconnect()
    tell('Transfer complete', severity=0)
    # No error path is tracked here; the method always reports success.
    return False
def get_infos(self, url, code, nofollow, score):
    """Manage all searches of webpage's informations.

    :param url: url of webpage
    :type url: str
    :param code: source code of webpage
    :type code: str
    :param nofollow: if we take links of webpage
    :type nofollow: bool
    :param score: score of webpage
    :type score: int
    :return: dict of webpage informations and list of links
    """
    results = dict()
    results['homepage'] = 1 if searches.is_homepage(url) else 0
    self.parser.feed(code)
    # Find title and clean it:
    results['title'] = searches.clean_text(
        searches.capitalize(self.parser.title))
    keywords = searches.clean_text(self.parser.keywords.lower()).split()
    # Language:
    if self.parser.language != '':
        language = self.parser.language
        score += 1  # reward pages that declare their language
    else:
        language = self.detect_language(keywords)
    if language in self.STOPWORDS and self.parser.title != '':
        keywords = self.clean_keywords(keywords, language)
        keywords.extend(
            self.clean_keywords(results['title'].lower().split(), language))
        infos_url = urlparse(url)
        # Strip the file extension from the url path, if any.
        # Bug fix: rfind returning -1 (no dot) used to slice as
        # path[:-1], silently dropping the last path character.
        dot_position = infos_url.path.rfind('.')
        url_path = infos_url.path if dot_position == -1 else infos_url.path[:dot_position]
        # NOTE(review): clean_keywords receives a list everywhere else
        # but a plain str here — confirm it handles a string as intended.
        keywords.extend(self.clean_keywords(url_path, language))
        results['sanesearch'] = self.sane_search(keywords, language)
        results['language'] = language
        results['keywords'] = keywords
        # Description:
        if self.parser.description == '':
            results['description'] = searches.clean_text(
                searches.capitalize(self.parser.first_title))
        else:
            results['description'] = searches.clean_text(
                searches.capitalize(self.parser.description))
        # Css:
        if self.parser.css:
            score += 1
        base_url = searches.get_base_url(url)
        # Links:
        if nofollow:
            links = list()
        else:
            links = data_processing.clean_links(self.parser.links, base_url)
            searches.stats_links(len(links))
        if self.parser.favicon != '':
            results['favicon'] = self.clean_favicon(
                self.parser.favicon, base_url)
        else:
            results['favicon'] = ''
    else:
        tell('No language or title', severity=0)
        results = {'title': ''}
        links = list()
        results['language'] = '*'
    results['score'] = score
    return results, links