def test_getPageText(self):
    """getPageText on the Twitter API search endpoint should yield ' '.

    The endpoint serves JSON (no scrapable page text), so the helper is
    expected to fall back to returning a single space.
    """
    url = "https://api.twitter.com/1.1/search/tweets.json"
    self.assertEqual(search.getPageText(url), ' ')
# Prints: "1 out of 427 : PLANT_NAME" print(index+1, "out of", len(plants), ":", plant) # Get a list of urls url1 = search.getUrls(query, "google", verbose=True) url2 = search.getUrls(query, "duckduckgo", verbose=True) urlList = [url1, url2] urls = list(set().union(*urlList)) # union() removes duplicates # Search the urls for occurrences of artist names ## Open each url and add to one string HTMLOfPages HTMLOfPages = "" for url in urls : html = search.getPageText(url, verbose=True, timeout=3) HTMLOfPages += html print("Finished downloading all pages. Searching for artists...") # There is an artist named "Erro" and he get's matched for every single # "error" in the site text. Since no artist has "error" in their name # we can safely get rid of "error" strings without messing up the results. pattern = re.compile("error", re.IGNORECASE) HTMLOfPages = pattern.sub("", HTMLOfPages.lower()) ## For each artist, count the number of occurrences that artist has in the ## file and add it to an array (counter) with it's index corresponding to ## the index of that artist counter = [] for index, artist in enumerate(artists, start=0): count = HTMLOfPages.count(artist.lower())
def test_getPageText(self):
    """The Twitter API endpoint returns no page text — expect ' '."""
    result = search.getPageText(
        "https://api.twitter.com/1.1/search/tweets.json"
    )
    self.assertEqual(result, ' ')
# NOTE(review): formatting reconstructed from a collapsed single line.
# Assumes `search`, `save`, `fileNames` and `pageCounter` are defined
# earlier in the file (pageCounter presumably starts at 0); confirm
# against the full file.
index = {}          # inverted index, populated by search.indexPage
indexedTerms = []   # accumulates every term that gets indexed
stopWords = search.getStopWords('english')
for fileName in fileNames:
    pageCounter += 1
    # Download/parse pipeline: raw content -> soup -> text -> tokens.
    pageContent = search.getPageContent(fileName)
    soup = search.getSoup(pageContent)
    pageText = search.getPageText(soup)
    tokens = search.getTokens(pageText)
    print('Indexing page #' + str(pageCounter) + ': ' + fileName)
    search.indexPage(tokens, stopWords, index, pageCounter, indexedTerms)
# NOTE(review): the lines below are assumed to run once, after the loop
# (the index is sorted and saved as a whole) — verify indentation in the
# original file.
sortedIndex = save.sortIndex(index)
save.saveFile('Index', '.txt', str(sortedIndex))
indexedTermsToSave = save.arrayToSave(indexedTerms)
save.saveFile('IndexedTerms', '.txt', indexedTermsToSave)