def test_read_file_with_urls_missing_file(self):
    """A missing input file should make the loader raise.

    Bug fix: the original wrapped its own ``assert False`` inside the same
    ``try`` with a bare ``except:``, so the AssertionError was swallowed and
    the test could never fail (it just printed "fred was here").
    """
    try:
        load_urls_from_file("urls0.txt")
    except Exception:  # TODO(review): narrow to the exception the loader actually raises
        pass  # expected path: loading a nonexistent file failed
    else:
        assert False, "load_urls_from_file did not raise for a missing file"
def main(database: str, url_list_file: str):
    """Scrape words from every URL listed in *url_list_file* and persist them.

    Args:
        database: database name (echoed to stdout; the SQLite file path is
            derived from this script's own directory).
        url_list_file: path to a text file with one URL per line.
    """
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    print(len(big_word_list))
    # database code
    # Bug fix: os.path.dirname(__file__) is "" when the script is launched by
    # bare name from its own directory, and os.chdir("") raises
    # FileNotFoundError; abspath() makes the dirname always non-empty.
    # chdir is kept (not replaced by a joined path) because the relative
    # "docfile.docx" read below depends on the changed working directory.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, word_list=big_word_list)
    # NOTE(review): the .docx dump below looks unrelated to the scraping flow;
    # confirm it belongs in main().
    filename = 'docfile.docx'
    text_data = read_word.getText(filename)
    print(text_data)
def test_load_file():
    """Test the load_urls_from_file() method and check that more than one
    URL is read.

    Bug fix: the original used a hard-coded absolute path under
    /Users/zhaohuiliang/..., so the test only passed on one machine; the
    input file is now resolved relative to this test file.
    """
    import os
    input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "input.txt")
    test_urls = load_urls_from_file(input_path)
    assert len(test_urls) > 1
def test_read_file_with_1url(self, mock_open, monkeypatch):
    """A file containing a single URL should load as a one-element list."""
    self.urls = [self.firstURL]
    mock_exists = MagicMock(return_value=True)
    monkeypatch.setattr("os.path.exists", mock_exists)
    urls = load_urls_from_file("urls1.txt")
    mock_open.assert_called_once_with("urls1.txt", "r")
    # Bug fix: the loader returns a list (the sibling tests assert against
    # [] and len()), so compare with the one-element list, not the bare
    # URL string as the original did.
    assert urls == [self.firstURL]
def main(database: str, url_list_file: str):
    """Announce the database and URL list, then collect scraped words
    from every listed page."""
    collected_words = []
    print('我们将使用数据库:' + database)
    print('我们将抓取以下页面:' + url_list_file)
    for current_url in url_utilities.load_urls_from_file(url_list_file):
        print('当前:' + current_url)
        page_html = url_utilities.load_page(current_url)
        collected_words.extend(url_utilities.scrape_page(page_html))
def main(database: str, url_list_file: str):
    """Print the scrape plan, then gather words from each listed URL."""
    word_accumulator = []
    print("we are going work with " + database)
    print("we are going scan " + url_list_file)
    url_list = url_utilities.load_urls_from_file(url_list_file)
    for current in url_list:
        print("reading " + current)
        content = url_utilities.load_page(url=current)
        word_accumulator.extend(url_utilities.scrape_page(page_contents=content))
def main(database: str, url_list_file: str):
    """Scrape every URL in *url_list_file*, accumulating the words found.

    Args:
        database: database name (currently only echoed to stdout).
        url_list_file: path to a file with one URL per line.
    """
    # Bug fix: big_word_list was never initialized, so .extend() below
    # raised NameError on the first iteration.
    big_word_list = []
    # print() with comma-separated args inserts a space; messages kept as-is.
    print("We are going to work with ", database)
    print("We are going to scan ", url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading ", url)
        page_content = url_utilities.load_page(url=url)
        # NOTE(review): keyword is page_content here but page_contents in
        # sibling variants — confirm against scrape_page's signature.
        words = url_utilities.scrape_page(page_content=page_content)
        big_word_list.extend(words)
def main(database, url_list_file):
    """Scrape the listed URLs and store up to 250 000 words in the database."""
    words_found = []
    print('Db: ' + database)
    print('input list: ' + url_list_file)
    for link in url_utilities.load_urls_from_file(url_list_file):
        print('reading: ', link)
        html = url_utilities.load_page(url=link)
        words_found.extend(url_utilities.scrape_page(page_contents=html))
    database_utilities.create_database(database)
    print('length words is: ', len(words_found))
    # Cap at 250 000 words before persisting, as in the original flow.
    database_utilities.save_words_to_database(words_found[:250000])
def main(database: str, url_list_file: str):
    """Scrape words from each URL in *url_list_file* and save them to words.db.

    Args:
        database: database name (echoed to stdout only; the SQLite file is
            always words.db next to this script).
        url_list_file: path to a text file with one URL per line.
    """
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
    # Bug fix: os.path.dirname(__file__) is "" when the script is launched by
    # bare name from its own directory, and os.chdir("") raises
    # FileNotFoundError; abspath() makes the dirname always non-empty.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words=big_word_list)
def main(database: str, url_list_file: str):
    """Announce the DB and URL file, then collect words from every page."""
    gathered = []
    print("We are gonna work with: " + database + " DB")
    print("We are gonna scan: " + url_list_file + " URL file")
    for link in url_utils.load_urls_from_file(url_list_file):
        print("reading: " + link)
        # load_page fetches and decodes the page; scrape_page filters the
        # raw text down to a word list, which extend() flattens into one
        # running list (extend adds an iterable, append would add the list).
        page_text = url_utils.load_page(url=link)
        gathered.extend(url_utils.scrape_page(page_text))
def main(database: str, url_list_file: str):
    """Scrape words from each listed URL and persist them to words.db.

    Args:
        database: database name (echoed to stdout; the SQLite file path is
            derived from this script's directory).
        url_list_file: path to a text file with one URL per line.
    """
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    # database code - mitigating cross-platform file path issues
    # dunder __file__ gives the location of the file we're currently running.
    # Bug fix: dirname(__file__) is "" when the script is launched by bare
    # name from its own directory, and os.chdir("") raises FileNotFoundError;
    # abspath() makes the dirname always non-empty.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
def main(database: str, url_list_file: str):
    """Scrape words from each listed URL and save them into words.db.

    Args:
        database: database name (echoed to stdout; the actual SQLite file is
            words.db next to this script).
        url_list_file: path to a text file with one URL per line.
    """
    big_word_list = []
    print("We are going to work with " + database)
    print("We are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        # scrape_page yields a raw list of words for this page
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    # database code
    # this is cross platform, so the path mechanism is important.
    # Bug fix: dirname(__file__) is "" when the script is launched by bare
    # name from its own directory, and os.chdir("") raises FileNotFoundError;
    # abspath() makes the dirname always non-empty.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs, store them, and print the top 10.

    Args:
        database: database name (echoed to stdout; see review note below —
            this parameter is later used as if it were a module).
        url_list_file: path to a text file with one URL per line.
    """
    big_word_list = []
    print("wer are going to work with " + database)
    print("wer are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
    # database code
    # 1: manage the issue of path to be multi platform
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    # 2: create the database
    # NOTE(review): `database` is the str parameter above, which shadows any
    # module of the same name — a str has no create_database attribute, so
    # the three `database.*` calls below will raise AttributeError at
    # runtime. A database-utilities module was presumably intended; confirm
    # the correct module name and fix.
    database.create_database(database_path=path)
    # 3: save list of words into the database
    database.save_words_to_data_base(database_path=path, words_list=big_word_list)
    # 4: report top 10 words by count
    top_list = database.print_database_table(database_path=path)
    print('Top 10 words:')
    for item in top_list:
        print(item)
def test_load_file():
    """The URL file should yield more than one entry."""
    loaded = load_urls_from_file("input.txt")
    assert len(loaded) > 1
def test_read_file_with_urls_none_in_file(self):
    """An empty URL file produces an empty list."""
    result = load_urls_from_file("urls0.txt")
    assert result == []
def test_read_file_with_2urls(self):
    """urls.txt is expected to hold exactly two URLs."""
    loaded = load_urls_from_file("urls.txt")
    assert len(loaded) == 2
def test_read_file_with_urls(self):
    """Loading urls.txt yields two entries."""
    loaded = load_urls_from_file("urls.txt")
    self.assertEqual(len(loaded), 2)
def test_read_file_with_urls_none_in_file(self):
    """An empty URL file loads as an empty list."""
    self.urls = load_urls_from_file("urls0.txt")
    self.assertEqual(self.urls, [])
def test_read_file_with_urls_missing_file(self):
    """A missing file is expected to load as an empty list (no URLs)."""
    self.urls = load_urls_from_file("missing_file.txt")
    self.assertEqual(len(self.urls), 0)
def test_load_file():
    """load_urls_from_file should read more than one URL from input.txt.

    Bug fix: the original used a hard-coded absolute path under
    C:/Users/aviadp/..., so the test only passed on one machine; the input
    file is now resolved relative to this test file.
    """
    import os
    input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "input.txt")
    test_urls = load_urls_from_file(input_path)
    assert len(test_urls) > 1
def test_load_file():
    """input.txt is expected to contain more than one URL."""
    test_urls = load_urls_from_file('input.txt')
    # Bug fix: len() can never be -1, so the original assertion
    # `len(test_urls) == -1` failed unconditionally; the intended check
    # (as in the sibling load-file tests) is that multiple URLs were read.
    assert len(test_urls) > 1
def test_load_file(self):
    """The shared input.txt two directories up contains multiple URLs."""
    loaded = load_urls_from_file("../../input.txt")
    assert len(loaded) > 1