Example 1
import os

import database_utilities
import read_word
import url_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)

    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    print(len(big_word_list))

    # database code
    os.chdir(os.path.dirname(__file__))
    # make sure the db file is in the same directory as the .py file
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              word_list=big_word_list)

    # also read and print the text of a local Word document
    filename = 'docfile.docx'
    text_data = read_word.getText(filename)
    print(text_data)
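
All of these examples call into an url_utilities module that the listing never shows. Here is a minimal sketch of what its three helpers might look like, inferred only from how they are called above (the bodies, the regex, and the error handling are assumptions, not the course's actual code):

import re
from urllib.request import urlopen


def load_urls_from_file(file_path: str) -> list:
    # assume one URL per line in the input file
    with open(file_path) as f:
        return [line.strip() for line in f if line.strip()]


def load_page(url: str):
    # fetch the page and decode it as UTF-8; return False on failure,
    # which matches the falsy checks in the test examples further down
    try:
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    except Exception:
        return False


def scrape_page(page_contents: str) -> list:
    # keep alphabetic words of two or more letters, lowercased
    return [w.lower() for w in re.findall(r'[A-Za-z]{2,}', page_contents)]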
Example 2
def main(database: str, url_list_file: str):
    big_word_list = []
    print('We will use the database: ' + database)
    print('We will scrape the pages listed in: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('Current: ' + url)
        page_content = url_utilities.load_page(url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
Example 3
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print(f'reading {url}')
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
Example 4
def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going work with " + database)
    print("we are going scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
Example 5
def main(database: str, url_list_file: str):
    big_word_list = []  # missing in the original, but extend() below needs it
    print(
        "We are going to work with ",
        database)  # the tutorial videos use +, but a comma works here too
    print("We are going to scan ", url_list_file)  # the + gave an error here
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading ", url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)
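
The two comments in this example point at a real difference: print() with comma-separated arguments converts each one with str(), while + concatenation raises a TypeError unless both operands are already strings. A standalone illustration (not part of the original snippet):

count = 3
print("found", count, "words")  # OK: print() stringifies each argument
# print("found " + count)       # TypeError: can only concatenate str (not "int") to str
print("found " + str(count))    # explicit conversion makes + work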
Example 6
def main(database, url_list_file):
    big_word_list = []
    print('Db: ' + database)
    print('input list: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('reading: ', url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    database_utilities.create_database(database)
    print('number of words: ', len(big_word_list))
    database_utilities.save_words_to_database(big_word_list[:250000])
Example 7
def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words=big_word_list)
Example 8
def main(database: str, url_list_file: str):
    big_word_list = []
    print("We are gonna work with: " + database + " DB")
    print("We are gonna scan: " + url_list_file + " URL file")
    urls = url_utils.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading: " + url)
        page_content = url_utils.load_page(
            url=url)  # use load_page from url_utilities
        # to fetch the URL contents and decode them as UTF-8
        words = url_utils.scrape_page(
            page_content)  # filters out numbers, single-letter words, etc.
        big_word_list.extend(
            words
        )  # Note: append adds one object; extend adds each item of an iterable, e.g. a list
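
The closing comment about append versus extend is worth a concrete check; a quick standalone illustration (not from the original code):

words = ['alpha', 'beta']

appended = []
appended.append(words)   # adds the list itself as a single element
extended = []
extended.extend(words)   # adds each string individually

print(appended)  # [['alpha', 'beta']]
print(extended)  # ['alpha', 'beta']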
Example 9
def main(database: str, url_list_file: str):
    big_word_list = []
    print('We are going to work with ' + database)
    print('We will scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('Reading ' + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), 'words.db')
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
Example 10
def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code - mitigating cross-platform file path issues
    # dunder __file__ gives the location of the file we're currently running
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
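
The os.chdir/os.path combination above works, but it changes the process-wide working directory as a side effect. A hedged alternative sketch of the same idea using pathlib (the helper name is a hypothetical, for illustration only):

from pathlib import Path


def words_db_path() -> str:
    # hypothetical helper: resolve words.db next to this script
    # without touching the current working directory
    return str(Path(__file__).resolve().parent / "words.db")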
Example 11
def main(database: str, url_list_file: str):
    big_word_list = []
    print("We are going to work with " + database)
    print("We are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(
            page_contents=page_content)  # this gives a raw list of words
        big_word_list.extend(words)

    # database code
    # the script should run cross-platform, so building the path carefully matters
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
Example 12
def main(database: str, url_list_file: str):
    big_word_list = []
    print("wer are going to work with " + database)
    print("wer are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    # 1: manage the issue of path to be multi platform
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    # 2: create the database
    database_utilities.create_database(database_path=path)
    # 3: save list of words into the database
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
    # 4: report top 10 words by count
    top_list = database_utilities.print_database_table(database_path=path)
    print('Top 10 words:')
    for item in top_list:
        print(item)
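
Steps 2 through 4 assume a database_utilities module that the listing never shows. A minimal sqlite3 sketch of what those three helpers might look like (the table name, schema, and top-10 query are assumptions):

import sqlite3
from collections import Counter


def create_database(database_path: str) -> None:
    # create the words table if it does not already exist
    with sqlite3.connect(database_path) as conn:
        conn.execute('CREATE TABLE IF NOT EXISTS words (word TEXT, count INTEGER)')


def save_words_to_database(database_path: str, words_list: list) -> None:
    # store one row per distinct word together with its occurrence count
    counts = Counter(words_list)
    with sqlite3.connect(database_path) as conn:
        conn.execute('DELETE FROM words')
        conn.executemany('INSERT INTO words VALUES (?, ?)', counts.items())


def print_database_table(database_path: str) -> list:
    # return the ten most frequent words, most common first
    with sqlite3.connect(database_path) as conn:
        cur = conn.execute(
            'SELECT word, count FROM words ORDER BY count DESC LIMIT 10')
        return cur.fetchall()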
Example 13
 def test_load_page(self):
     html = load_page(self.url1)
     self.assertTrue(html)
Example 14
 def test_load_page_missing_url(self):
     html = load_page('')
     self.assertFalse(html)
Example 15
def test_load_page():
    html = load_page('https://en.wikipedia.org/wiki/Guido_van_Rossum')
    assert (len(html) > 0)
Example 16
 def test_scrape_page(self):
     html = load_page(self.firstURL)
     clean_words = scrape_page(page_contents=str(html))
     self.assertTrue(clean_words)
Example 17
 def test_load_page(self):
     html = load_page(self.firstURL)
     self.assertTrue(html)
Example 18
 def test_scrape_page(self):
     html = load_page(self.url1)
     clean_words = scrape_page(page_contents=html)
     self.assertTrue(clean_words)
Example 19
 def test_load_page_url_not_found(self):
     html = load_page("https//:www.bad_url.com")
     self.assertFalse(html)
Example 20
 def test_load_page_missing_url(self):
     html = load_page('')
     self.assertFalse(html)
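
Examples 13 through 20 are methods lifted out of unittest test classes, which is why they carry self and a stray level of indentation. A minimal sketch of the class they assume (the setUp URL is borrowed from Example 15, and firstURL is aliased to match Example 17; both attribute values are assumptions):

import unittest

from url_utilities import load_page


class UrlUtilitiesTests(unittest.TestCase):

    def setUp(self):
        # attribute names match the ones used in the examples above
        self.url1 = 'https://en.wikipedia.org/wiki/Guido_van_Rossum'
        self.firstURL = self.url1

    def test_load_page(self):
        html = load_page(self.url1)
        self.assertTrue(html)


if __name__ == '__main__':
    unittest.main()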