Example #1
0
 def test_getPageText(self):
     """A JSON API endpoint has no readable page text, so getPageText returns ' '."""
     endpoint = "https://api.twitter.com/1.1/search/tweets.json"
     self.assertEqual(search.getPageText(endpoint), ' ')
    # NOTE(review): this is a fragment of a larger function — `index`, `plant`,
    # `plants`, `query`, and `artists` are defined by the enclosing (unseen) code,
    # and the final loop is cut off mid-body. Left byte-identical; comments only.

    # Progress line, e.g.: "1 out of 427 : PLANT_NAME"
    print(index+1, "out of", len(plants), ":", plant)

    # Get a list of urls from two search engines for the same query
    url1 = search.getUrls(query, "google", verbose=True)
    url2 = search.getUrls(query, "duckduckgo", verbose=True)
    urlList = [url1, url2]
    urls = list(set().union(*urlList)) # union() removes duplicates

    # Search the urls for occurrences of artist names

    ## Open each url and concatenate all pages into one string, HTMLOfPages
    HTMLOfPages = ""
    for url in urls :
        html = search.getPageText(url, verbose=True, timeout=3)
        HTMLOfPages += html
    print("Finished downloading all pages. Searching for artists...")

    # There is an artist named "Erro" and he gets matched for every single
    # "error" in the site text. Since no artist has "error" in their name
    # we can safely get rid of "error" strings without messing up the results.
    # (The page text is lowercased below, so the IGNORECASE flag is redundant
    # but harmless.)
    pattern = re.compile("error", re.IGNORECASE)
    HTMLOfPages = pattern.sub("", HTMLOfPages.lower())

    ## For each artist, count the number of occurrences that artist has in the
    ## file and add it to an array (counter) with its index corresponding to
    ## the index of that artist
    counter = []
    for index, artist in enumerate(artists, start=0):
        count = HTMLOfPages.count(artist.lower())
 def test_getPageText(self):
     """getPageText yields ' ' for a non-HTML (JSON API) URL."""
     url = "https://api.twitter.com/1.1/search/tweets.json"
     result = search.getPageText(url)
     self.assertEqual(result, ' ')
Example #4
0
# Build an inverted index over a set of saved page files and persist the results.
# Relies on the project modules `search` and `save`, and on `fileNames`
# (presumably a list of page-file paths defined earlier — TODO confirm).

index = {}          # term -> postings, populated by search.indexPage
indexedTerms = []   # flat list of every term that got indexed

stopWords = search.getStopWords('english')

# BUG FIX: pageCounter was incremented below without ever being initialized,
# which raised a NameError on the first iteration.
pageCounter = 0

for fileName in fileNames:
    pageCounter += 1

    # Load the raw page, parse it, and tokenize the visible text.
    pageContent = search.getPageContent(fileName)
    soup = search.getSoup(pageContent)
    pageText = search.getPageText(soup)
    tokens = search.getTokens(pageText)

    print('Indexing page #' + str(pageCounter) + ': ' + fileName)

    # Add this page's tokens (minus stop words) to the shared index.
    search.indexPage(tokens, stopWords, index, pageCounter, indexedTerms)

# Persist the sorted index and the list of indexed terms.
sortedIndex = save.sortIndex(index)
save.saveFile('Index', '.txt', str(sortedIndex))

indexedTermsToSave = save.arrayToSave(indexedTerms)
save.saveFile('IndexedTerms', '.txt', indexedTermsToSave)