def main(database: str, url_list_file: str): big_word_list = [] print("we are going to work with " + database) print("we are going to scan " + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_contents=page_content) big_word_list.extend(words) print(len(big_word_list)) # database code os.chdir(os.path.dirname(__file__)) # make sure the db file is in the same directory of the .py file path = os.path.join(os.getcwd(), "words.db") database_utilities.create_database(database_path=path) database_utilities.save_words_to_database(database_path=path, word_list=big_word_list) filename = 'docfile.docx' text_data = read_word.getText(filename) print(text_data)
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with the database: ' + database)
    print('we are going to scrape the pages listed in: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('reading: ' + url)
        page_content = url_utilities.load_page(url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_url_from_files(url_list_file)
    for url in urls:
        print(f'reading {url}')
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
def main(database: str, url_list_file: str): big_word_list = [] print("we are going work with " + database) print("we are going scan " + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_contents=page_content) big_word_list.extend(words)
def main(database: str, url_list_file: str): print( "We are going to work with ", database) # the tutoring videos uses plus while here a comma responds print("We are going to scan ", url_list_file) # the plus gives erorr urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading ", url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_content=page_content) big_word_list.extend(words)
def main(database, url_list_file):
    big_word_list = []
    print('Db: ' + database)
    print('input list: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('reading: ', url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    database_utilities.create_database(database)
    print('length words is: ', len(big_word_list))
    database_utilities.save_words_to_database(big_word_list[:250000])
def main(database: str, url_list_file: str): big_word_list = [] print("we are going to work with " + database) print("we are going to scan " + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_content) big_word_list.extend(words) os.chdir(os.path.dirname(__file__)) path = os.path.join(os.getcwd(), "words.db") database_utilities.create_database(database_path=path) database_utilities.save_words_to_database(database_path=path, words=big_word_list)
def main(database: str, url_list_file: str): big_word_list = [] print("We are gonna work with: " + database + " DB") print("We are gonna scan: " + url_list_file + " URL file") urls = url_utils.load_urls_from_file(url_list_file) for url in urls: print("reading: " + url) page_content = url_utils.load_page( url=url) # Using the load_page function in url_ulitiliesi # to open the url contents and decode using utf-8 words = url_utils.scrape_page( page_content) #filtering out numbers, single letter words etc big_word_list.extend( words ) #Note: append adds an object, extend adds an iterable object i.e. list
def main(database: str, url_list_file: str):
    big_word_list = []
    print('We are going to work with ' + database)
    print('We will scan ' + url_list_file)
    urls = url_utilities.load_urls_from_files(url_list_file)
    for url in urls:
        print('Reading ' + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    # database code
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), 'words.db')
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
def main(database: str, url_list_file: str): big_word_list = [] print('we are going to work with ' + database) print('we are going to scan ' + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_contents=page_content) big_word_list.extend(words) # database code - mitigating cross-platform file path issues # dunder __file__ gives the location of the file we're currently running os.chdir(os.path.dirname(__file__)) path = os.path.join(os.getcwd(), "words.db") database_utilities.create_database(database_path=path) database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
def main(database: str, url_list_file: str): big_word_list = [] print("We are going to work with " + database) print("We are going to scan " + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page( page_contents=page_content) # this gives a raw list of words big_word_list.extend(words) # database code # this is a cross platform and so path mechanism is important os.chdir(os.path.dirname(__file__)) path = os.path.join(os.getcwd(), "words.db") database_utilities.create_database(database_path=path) database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
def main(database: str, url_list_file: str): big_word_list = [] print("wer are going to work with " + database) print("wer are going to scan " + url_list_file) urls = url_utilities.load_urls_from_file(url_list_file) for url in urls: print("reading " + url) page_content = url_utilities.load_page(url=url) words = url_utilities.scrape_page(page_contents=page_content) big_word_list.extend(words) # database code # 1: manage the issue of path to be multi platform os.chdir(os.path.dirname(__file__)) path = os.path.join(os.getcwd(), "words.db") # 2: create the database database.create_database(database_path=path) # 3: save list of words into the database database.save_words_to_data_base(database_path=path, words_list=big_word_list) # 4: report top 10 words by count top_list = database.print_database_table(database_path=path) print('Top 10 words:') for item in top_list: print(item)
def test_load_page(self):
    html = load_page(self.url1)
    self.assertTrue(html)
def test_load_page_missing_url(self):
    html = load_page('')
    assert html == False
def test_load_page():
    html = load_page('https://en.wikipedia.org/wiki/Guido_van_Rossum')
    assert len(html) > 0
def test_scrape_page(self):
    html = load_page(self.firstURL)
    clean_words = scrape_page(page_contents=str(html))
    assert clean_words != False
def test_load_page(self):
    html = load_page(self.firstURL)
    assert html != False
def test_scrape_page(self):
    html = load_page(self.url1)
    clean_words = scrape_page(page_contents=html)
    self.assertTrue(clean_words)
def test_load_page_url_not_found(self): html = load_page("https//:www.bad_url.com") self.assertFalse(html)
def test_load_page_missing_url(self):
    html = load_page('')
    self.assertFalse(html)
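The test methods above refer to self.url1 and self.firstURL without showing the class they belong to; a minimal unittest skeleton they could drop into, assuming the Guido van Rossum Wikipedia page as the test URL, would be:

# test_url_utilities.py -- hypothetical skeleton for the test methods above
import unittest

from url_utilities import load_page, scrape_page


class UrlUtilitiesTests(unittest.TestCase):
    def setUp(self):
        # the URL the individual tests refer to as self.url1 / self.firstURL
        self.url1 = 'https://en.wikipedia.org/wiki/Guido_van_Rossum'
        self.firstURL = self.url1

    def test_load_page(self):
        html = load_page(self.url1)
        self.assertTrue(html)

    def test_scrape_page(self):
        clean_words = scrape_page(page_contents=load_page(self.url1))
        self.assertTrue(clean_words)


if __name__ == '__main__':
    unittest.main()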