def test_read_file_with_urls_missing_file(self):
    try:
        load_urls_from_file("urls0.txt")
        assert False, "expected an exception for a missing file"
    except FileNotFoundError:  # narrowed from a bare except, which would also
        pass                   # swallow the AssertionError and never fail
Example #2
import os

import database_utilities
import read_word
import url_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)

    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    print(len(big_word_list))

    # database code
    os.chdir(os.path.dirname(__file__))
    # make sure the db file is in the same directory as the .py file
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              word_list=big_word_list)

    # read back a Word document using the read_word helper
    filename = 'docfile.docx'
    text_data = read_word.getText(filename)
    print(text_data)
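The url_utilities helpers are called throughout these examples but never defined. A minimal sketch, assuming a plain urllib download decoded as UTF-8 and a regex-based word filter (both hinted at by the comments in Example #10), might look like this; the real module may differ.

import re
import urllib.request


def load_page(url: str) -> str:
    # Download the page body and decode it as UTF-8 (an assumption;
    # real pages may declare other encodings).
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def scrape_page(page_contents: str):
    # Keep lowercase alphabetic tokens of two or more characters,
    # which filters out numbers and single-letter words.
    return [word.lower() for word in re.findall(r"[A-Za-z]{2,}", page_contents)]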
def test_load_file():
    """
    test the load_urls_from_file() method and check that more than one URL is read
    """

    test_urls = load_urls_from_file(
        "/Users/zhaohuiliang/PycharmProjects/PageSpider/input.txt")
    assert (len(test_urls) > 1)
    def test_read_file_with_1url(self, mock_open, monkeypatch):
        self.urls = [self.firstURL]
        mock_exists = MagicMock(return_value=True)
        monkeypatch.setattr("os.path.exists", mock_exists)

        urls = load_urls_from_file("urls1.txt")
        mock_open.assert_called_once_with("urls1.txt", "r")
        assert urls == self.urls  # the loader returns a list, not a bare URL
Example #5
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we will use the database: ' + database)
    print('we will scrape the following pages: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('current: ' + url)
        page_content = url_utilities.load_page(url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going work with " + database)
    print("we are going scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
Example #7
def main(database: str, url_list_file: str):
    big_word_list = []  # missing in the original; extend() below needs it
    print(
        "We are going to work with ",
        database)  # the tutorial videos use plus, while a comma is used here
    print("We are going to scan ", url_list_file)  # using plus gave an error
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading ", url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_content=page_content)
        big_word_list.extend(words)
Example #8
def main(database, url_list_file):
    big_word_list = []
    print('Db: ' + database)
    print('input list: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('reading: ', url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    database_utilities.create_database(database)
    print('length words is: ', len(big_word_list))
    database_utilities.save_words_to_database(big_word_list[:250000])
Example #9
def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words=big_word_list)
Example #10
def main(database: str, url_list_file: str):
    big_word_list = []
    print("We are gonna work with: " + database + " DB")
    print("We are gonna scan: " + url_list_file + " URL file")
    urls = url_utils.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading: " + url)
        page_content = url_utils.load_page(
            url=url)  # use the load_page function in url_utils
        # to open the url contents and decode them using utf-8
        words = url_utils.scrape_page(
            page_content)  # filter out numbers, single-letter words, etc.
        big_word_list.extend(
            words
        )  # Note: append adds one object; extend adds each item of an iterable (e.g. a list)
Example #11
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code - mitigating cross-platform file path issues
    # dunder __file__ gives the location of the file we're currently running
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
Example #12
def main(database: str, url_list_file: str):
    big_word_list = []
    print("We are going to work with " + database)
    print("We are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(
            page_contents=page_content)  # this gives a raw list of words
        big_word_list.extend(words)

    # database code
    # this runs cross-platform, so the path-building mechanism is important
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
Example #13
def main(database: str, url_list_file: str):
    big_word_list = []
    print("wer are going to work with " + database)
    print("wer are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    # 1: manage the issue of path to be multi platform
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    # 2: create the database (via database_utilities, as in the other
    #    examples; `database` itself is a string, so calling methods on it would fail)
    database_utilities.create_database(database_path=path)
    # 3: save the list of words into the database
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
    # 4: report the top 10 words by count
    top_list = database_utilities.print_database_table(database_path=path)
    print('Top 10 words:')
    for item in top_list:
        print(item)
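The database_utilities helpers are also left undefined. Since these examples write to a words.db file, sqlite3 is a plausible backend; the sketch below is an assumption (table and column names included), kept consistent with the create/save/print calls in Example #13.

import sqlite3
from collections import Counter


def create_database(database_path: str):
    # Create the words table if it does not exist yet.
    with sqlite3.connect(database_path) as connection:
        connection.execute(
            "CREATE TABLE IF NOT EXISTS words (word TEXT PRIMARY KEY, count INTEGER)")


def save_words_to_database(database_path: str, words_list):
    # Store one row per distinct word together with its occurrence count.
    counts = Counter(words_list)
    with sqlite3.connect(database_path) as connection:
        connection.executemany(
            "INSERT OR REPLACE INTO words (word, count) VALUES (?, ?)",
            counts.items())


def print_database_table(database_path: str):
    # Return the ten most frequent words, as Example #13 expects.
    with sqlite3.connect(database_path) as connection:
        cursor = connection.execute(
            "SELECT word, count FROM words ORDER BY count DESC LIMIT 10")
        return cursor.fetchall()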
Example #14
def test_load_file():
    test_urls = load_urls_from_file("input.txt")
    assert (len(test_urls) > 1)
 def test_read_file_with_urls_none_in_file(self):
     urls = load_urls_from_file("urls0.txt")
     assert [] == urls
 def test_read_file_with_2urls(self):
     urls = load_urls_from_file("urls.txt")
     assert 2 == len(urls)
Example #17
 def test_read_file_with_urls(self):
     urls = load_urls_from_file("urls.txt")
     self.assertEqual(2, len(urls))
Example #18
 def test_read_file_with_urls_none_in_file(self):
     self.urls = load_urls_from_file("urls0.txt")
     self.assertEqual([], self.urls)
Example #19
 def test_read_file_with_urls_missing_file(self):
     self.urls = load_urls_from_file("missing_file.txt")
     self.assertEqual(0, len(self.urls))
def test_load_file():
    test_urls = load_urls_from_file("C:/Users/aviadp/PycharmProjects/PageSpider/input.txt")
    assert (len(test_urls) > 1)
def test_load_file():
    test_urls = load_urls_from_file('input.txt')
    assert len(test_urls) > 0  # len() is never negative, so the original == -1 check always failed
Example #22
 def test_load_file(self):
     test_urls = load_urls_from_file("../../input.txt")
     assert (len(test_urls) > 1)
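For completeness, here is one hedged way the main(database, url_list_file) functions above could be wired to the command line; the flag names are assumptions for illustration.

import argparse


if __name__ == "__main__":
    # Hypothetical entry point for the main(database, url_list_file) examples.
    parser = argparse.ArgumentParser(description="Scrape words from a list of URLs.")
    parser.add_argument("--database", help="name of the database to work with")
    parser.add_argument("--url-list-file", help="text file with one URL per line")
    args = parser.parse_args()
    main(database=args.database, url_list_file=args.url_list_file)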